diff --git a/README.md b/README.md index 4206fe0..972395d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 🏋️‍♂️ BenchLLM 🏋️‍♀️ -🦾 Continuous Integration for LLM powered applications 🦙🦅🤖 +🦾 Continuous Integration for LLM powered applications 🦙🦅🤖 [![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers) [![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs) @@ -10,7 +10,6 @@ BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community - ## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)
@@ -26,7 +25,7 @@ Use BenchLLM to: > ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes. > ->For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page. +> For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page. ## 🧪 BenchLLM Testing Methodology @@ -116,6 +115,16 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat ``` $ bench run --evaluator string-match --workers 5 ``` +To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has already been evaluated and a cache is enabled, the stored result is reused instead of re-running the evaluator. There are several types of caches: + +- `memory`, caches results in memory only for the duration of the current run. This is particularly useful when running with `--retry-count N`. +- `file`, stores the cache as a JSON file in `output/cache.json` at the end of the run. This is the default behavior. +- `none`, does not use any cache. + +```bash +$ bench run examples --cache memory +``` + ### 🧮 Eval While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function. @@ -163,6 +172,20 @@ results = evaluator.run() print(results) ``` +If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows: + +```python +from benchllm.cache import FileCache + +... + +evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json")) +evaluator.load(predictions) +results = evaluator.run() +``` + +In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow parallel evaluations. The cached results are saved in the file specified by `Path("path/to/cache.json")`. + ## ☕️ Commands - `bench add`: Add a new test to a suite. 
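For readers who want the in-memory variant programmatically rather than via the CLI flag, here is a minimal sketch (not part of this diff) that mirrors the `FileCache` example above, assuming the same `predictions` list; it relies only on the `MemoryCache` class and the `num_cache_hits`/`num_cache_misses` properties introduced in `benchllm/cache.py`:

```python
from benchllm import StringMatchEvaluator
from benchllm.cache import MemoryCache

# Wrap any evaluator; results are cached in memory only for the duration of this run.
evaluator = MemoryCache(StringMatchEvaluator(workers=2))
evaluator.load(predictions)  # `predictions` as in the README example above (assumed to exist)
results = evaluator.run()

# MemoryCache exposes simple counters that the CLI listener also reports after a run.
print(evaluator.num_cache_hits, evaluator.num_cache_misses)
```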
diff --git a/benchllm/__init__.py b/benchllm/__init__.py index 43cb476..8c0bf5f 100644 --- a/benchllm/__init__.py +++ b/benchllm/__init__.py @@ -1,10 +1,10 @@ import inspect from pathlib import Path -from typing import Any, Callable, Generic, Optional, Type, TypeVar +from typing import Callable, Type, TypeVar from .data_types import Evaluation, Prediction, Test # noqa from .evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator # noqa -from .input_types import ChatInput, SimilarityInput +from .input_types import ChatInput, SimilarityInput # noqa from .similarity import semantically_similar # noqa from .singleton import TestSingleton # noqa from .tester import Tester # noqa diff --git a/benchllm/cache.py b/benchllm/cache.py new file mode 100644 index 0000000..9981d47 --- /dev/null +++ b/benchllm/cache.py @@ -0,0 +1,96 @@ +import json +from pathlib import Path +from typing import Optional + +from benchllm.data_types import Evaluation, Prediction +from benchllm.evaluator import Evaluator +from benchllm.input_types import Json +from benchllm.listener import EvaluatorListener + + +class MemoryCache(Evaluator): + """Caches the results of the evaluator in memory""" + + def __init__(self, evaluator: Evaluator): + super().__init__(workers=evaluator.workers) + self._data: dict = {} + self._evaluator = evaluator + self._num_cache_misses = 0 + self._num_cache_hits = 0 + + def _key(self, answer1: Json, answer2: Json) -> str: + key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1]) + return key1 if key1 < key2 else key2 + + def lookup(self, answer1: Json, answer2: Json) -> Optional[bool]: + return self._data.get(self._key(answer1, answer2), None) + + def store(self, answer1: Json, answer2: Json, value: bool) -> None: + key = self._key(answer1, answer2) + self._data[key] = value + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + uncached_expectations = [] + for expected in prediction.test.expected: + lookup = self.lookup(expected, prediction.output) + if lookup is None: + uncached_expectations.append(expected) + elif lookup: + # If we find a positive match we can stop comparing and just return. + # For negative matches we still need to check the other expected answers. + self._num_cache_hits += 1 + return Evaluator.Match(prediction=prediction.output, expected=expected) + + # If all expectations were found in the cache but were negative matches, + # we increment the cache hits counter and return None as there's no match. 
+ if not uncached_expectations: + self._num_cache_hits += 1 + return None + + self._num_cache_misses += 1 + # set prediction.test.expected to only the ones that were not cached + prediction = Prediction(**prediction.dict()) + prediction.test.expected = uncached_expectations + result = self._evaluator.evaluate_prediction(prediction) + if result: + self.store(result.expected, result.prediction, True) + else: + for expected in prediction.test.expected: + self.store(expected, prediction.output, False) + return result + + @property + def num_cache_hits(self) -> int: + return self._num_cache_hits + + @property + def num_cache_misses(self) -> int: + return self._num_cache_misses + + +class FileCache(MemoryCache, EvaluatorListener): + """Caches the results of the evaluator in a json file""" + + def __init__(self, evaluator: Evaluator, path: Path): + super().__init__(evaluator) + self._path = path + self.add_listener(self) + self._load() + + def _load(self) -> None: + if self._path.exists(): + try: + cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str) + if cache["version"] != "1": + raise ValueError("Unsupported cache version") + self._data = cache["entries"] + except Exception: + print(f"Failed to load cache file {self._path}") + self._data = {} + + def _save(self) -> None: + cache = {"entries": self._data, "version": "1"} + self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8") + + def evaluate_ended(self, evaluations: list[Evaluation]) -> None: + self._save() diff --git a/benchllm/cli/commands/evaluate.py b/benchllm/cli/commands/evaluate.py index e3e14bc..0451d5d 100644 --- a/benchllm/cli/commands/evaluate.py +++ b/benchllm/cli/commands/evaluate.py @@ -1,13 +1,13 @@ from pathlib import Path +from benchllm.cache import FileCache from benchllm.cli.listener import ReportListener, RichCliListener -from benchllm.cli.utils import get_evaluator -from benchllm.evaluator import load_prediction_files -from benchllm.utils import find_json_yml_files +from benchllm.cli.utils import add_cache, get_evaluator +from benchllm.utils import find_json_yml_files, load_prediction_files def evaluate_predictions( - file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str + file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str ) -> bool: files = find_json_yml_files(file_or_dir) @@ -17,6 +17,10 @@ def evaluate_predictions( load_prediction_files(file_or_dir) evaluator = get_evaluator(evaluator_name, model, workers) + evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json") + + cli_listener.set_evaulator(evaluator) + evaluator.add_listener(cli_listener) evaluator.add_listener(report_listener) for file in files: diff --git a/benchllm/cli/commands/run_suite.py b/benchllm/cli/commands/run_suite.py index 53e39ca..075d8c0 100644 --- a/benchllm/cli/commands/run_suite.py +++ b/benchllm/cli/commands/run_suite.py @@ -2,8 +2,9 @@ import typer +from benchllm.cache import FileCache from benchllm.cli.listener import ReportListener, RichCliListener -from benchllm.cli.utils import get_evaluator +from benchllm.cli.utils import add_cache, get_evaluator from benchllm.tester import Tester from benchllm.utils import find_files @@ -17,6 +18,7 @@ def run_suite( workers: int, evaluator_name: str, retry_count: int, + cache: str, ) -> bool: files = find_files(file_search_paths) if not files: @@ -45,6 +47,10 @@ def run_suite( return True evaluator = get_evaluator(evaluator_name, model, workers) + evaluator = 
add_cache(cache, evaluator, output_dir.parent / "cache.json") + + cli_listener.set_evaulator(evaluator) + evaluator.add_listener(cli_listener) evaluator.add_listener(report_listener) evaluator.load(tester.predictions) diff --git a/benchllm/cli/evaluator.py b/benchllm/cli/evaluator.py deleted file mode 100644 index d36bc85..0000000 --- a/benchllm/cli/evaluator.py +++ /dev/null @@ -1,78 +0,0 @@ -import signal - -import typer -from pywebio import session -from pywebio.input import actions -from pywebio.output import put_markdown, put_table - -from benchllm.data_types import Prediction -from benchllm.evaluator import Evaluator - - -class InteractiveEvaluator(Evaluator): - def evaluate_prediction(self, prediction: Prediction) -> bool: - header = ( - f'{typer.style("Does ", bold=True)}' - f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}" - f'{typer.style(" match any of the following expected prompts?", bold=True)}' - ) - typer.echo("") - typer.echo(header) - - for i, expected in enumerate(prediction.test.expected, start=1): - typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False) - typer.secho(expected, bold=True) - - while True: - prompt_string = ( - f'{typer.style("[")}' - f'{typer.style("y", fg=typer.colors.GREEN, bold=True)}' - f'{typer.style("/")}' - f'{typer.style("n", fg=typer.colors.RED, bold=True)}' - f'{typer.style("]")}' - ) - - response = response = typer.prompt(prompt_string).lower() - if response == "y": - return True - elif response == "n": - return False - else: - typer.secho( - 'Invalid answer. Please just use "y" to mark the test as correct, and "n" to mark the test as incorrect', - fg=typer.colors.RED, - bold=True, - ) - continue - - -class WebEvaluator(Evaluator): - def __init__(self): - super().__init__(workers=1) - - @session.defer_call - def on_close(): - typer.secho( - f"The evaluation was interrupted. 
Run bench eval to start again", fg=typer.colors.RED, bold=True - ) - # sys.exit doesn't work here, so we have to raise a signal to kill the process - signal.raise_signal(signal.SIGINT) - - put_markdown("# BenchLLM Web Evaluator") - - def evaluate_prediction(self, prediction: Prediction) -> bool: - test_name = prediction.test.file_path or prediction.test.id - - put_markdown(f"## {test_name}") - table = [["Question:", f"{prediction.test.input}"], ["Prediction:", prediction.output]] - for i, expected in enumerate(prediction.test.expected): - table.append([f"Expected ({i+1}):", expected]) - - put_table(table) - - result = actions( - label="Does the prediction match any of the answers?", - buttons=[{"label": "Yes", "value": True}, {"label": "No", "value": False}], - ) - - return bool(result) diff --git a/benchllm/cli/evaluator/__init__.py b/benchllm/cli/evaluator/__init__.py new file mode 100644 index 0000000..fb9ee8a --- /dev/null +++ b/benchllm/cli/evaluator/__init__.py @@ -0,0 +1,2 @@ +from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa +from benchllm.cli.evaluator.web import WebEvaluator # noqa diff --git a/benchllm/cli/evaluator/interactive.py b/benchllm/cli/evaluator/interactive.py new file mode 100644 index 0000000..25aa068 --- /dev/null +++ b/benchllm/cli/evaluator/interactive.py @@ -0,0 +1,31 @@ +from typing import Optional + +import click +import typer + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + + +class InteractiveEvaluator(Evaluator): + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + header = ( + f'{typer.style("Does ", bold=True)}' + f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}" + f'{typer.style(" match any of the following expected prompts?", bold=True)}' + ) + typer.echo("") + typer.echo(header) + + for i, expected in enumerate(prediction.test.expected, start=1): + typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False) + typer.secho(expected, bold=True) + + options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"] + + prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]" + click_choice = click.Choice(options) + response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower() + if response == "n": + return None + return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[int(response) - 1]) diff --git a/benchllm/cli/evaluator/web.py b/benchllm/cli/evaluator/web.py new file mode 100644 index 0000000..7ca7caf --- /dev/null +++ b/benchllm/cli/evaluator/web.py @@ -0,0 +1,47 @@ +import signal +from typing import Optional + +import typer +from pywebio import session +from pywebio.input import radio +from pywebio.output import put_markdown + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + + +class WebEvaluator(Evaluator): + def __init__(self) -> None: + super().__init__(workers=1) + + @session.defer_call + def on_close() -> None: + print("shutting down") + typer.secho( + f"The evaluation was interrupted. 
Run bench eval to start again", fg=typer.colors.RED, bold=True + ) + # sys.exit doesn't work here, so we have to raise a signal to kill the process + signal.raise_signal(signal.SIGINT) + + put_markdown("# BenchLLM Web Evaluator") + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + test_name = prediction.test.file_path or prediction.test.id + + put_markdown(f"## {test_name}") + put_markdown(f"*Question*: `{prediction.test.input}`") + put_markdown(f"*Prediction*: `{prediction.output}`") + + table = [["Question:", f"{prediction.test.input}", ""], ["Prediction:", prediction.output], ""] + label = f"Question: {prediction.test.input}Prediction: {prediction.output}" + + options: list[dict[str, Optional[int | str]]] = [ + {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected) + ] + options.append({"label": "None", "value": None, "selected": True}) + answer = radio("Pick the matching answer", options=options, required=True) + + if answer and isinstance(answer, int): + return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[answer]) + else: + return None diff --git a/benchllm/cli/listener.py b/benchllm/cli/listener.py index e1f517e..f1f6135 100644 --- a/benchllm/cli/listener.py +++ b/benchllm/cli/listener.py @@ -1,6 +1,7 @@ import datetime import json from pathlib import Path +from typing import Optional import typer from rich import print @@ -8,7 +9,9 @@ from rich.markup import render from rich.table import Table +from benchllm.cache import MemoryCache from benchllm.data_types import Evaluation, FunctionID, Prediction, Test, TestFunction +from benchllm.evaluator import Evaluator from benchllm.listener import EvaluatorListener, TesterListener @@ -37,12 +40,23 @@ def evaluate_prediction_ended(self, evaluation: Evaluation) -> None: class RichCliListener(TesterListener, EvaluatorListener): - def __init__(self, root_dir: Path, *, interactive: bool, test_only: bool = False, eval_only: bool = False) -> None: + def __init__( + self, + root_dir: Path, + *, + interactive: bool, + test_only: bool = False, + eval_only: bool = False, + ) -> None: super().__init__() self.root_dir = root_dir self.interactive = interactive self._eval_only = eval_only self._test_only = test_only + self._evaluator: Optional[Evaluator] = None + + def set_evaulator(self, evaluator: Evaluator) -> None: + self._evaluator = evaluator def test_run_started(self) -> None: print_centered(" Run Tests ") @@ -116,6 +130,9 @@ def evaluate_ended(self, evaluations: list[Evaluation]) -> None: console.print(table) tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] " + if isinstance(self._evaluator, MemoryCache): + tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) " + print_centered(tmp) diff --git a/benchllm/cli/main.py b/benchllm/cli/main.py index 353fca7..e2fe202 100644 --- a/benchllm/cli/main.py +++ b/benchllm/cli/main.py @@ -28,6 +28,7 @@ def run( workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1, evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", + cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", ) -> None: if not file_or_dir: file_or_dir = [Path.cwd()] @@ -40,6 
+41,7 @@ def run( evaluator_name=evaluator, no_eval=not eval, retry_count=retry_count, + cache=cache, ) if not success: raise typer.Exit(code=1) @@ -61,6 +63,7 @@ def eval( model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3", workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", + cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", ) -> None: success = evaluate_predictions( file_or_dir=file_or_dir, @@ -68,6 +71,7 @@ def eval( output_dir=output_dir, workers=workers, evaluator_name=evaluator, + cache=cache, ) if not success: raise typer.Exit(code=1) diff --git a/benchllm/cli/utils.py b/benchllm/cli/utils.py index 9364e76..501048e 100644 --- a/benchllm/cli/utils.py +++ b/benchllm/cli/utils.py @@ -1,6 +1,7 @@ import datetime from pathlib import Path +from benchllm.cache import FileCache, MemoryCache from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator from benchllm.evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator @@ -28,3 +29,14 @@ def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator: return WebEvaluator() else: raise ValueError(f"Unknown evaluator {evaluator_name}") + + +def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator: + if cache_name == "file": + return FileCache(evaluator, cache_path) + elif cache_name == "memory": + return MemoryCache(evaluator) + elif cache_name == "none": + return evaluator + else: + raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'") diff --git a/benchllm/evaluator/__init__.py b/benchllm/evaluator/__init__.py new file mode 100644 index 0000000..d92e96e --- /dev/null +++ b/benchllm/evaluator/__init__.py @@ -0,0 +1,3 @@ +from benchllm.evaluator.evaluator import Evaluator # noqa +from benchllm.evaluator.semantic import SemanticEvaluator # noqa +from benchllm.evaluator.string_match import StringMatchEvaluator # noqa diff --git a/benchllm/evaluator.py b/benchllm/evaluator/evaluator.py similarity index 63% rename from benchllm/evaluator.py rename to benchllm/evaluator/evaluator.py index 27d8f68..87affb6 100644 --- a/benchllm/evaluator.py +++ b/benchllm/evaluator/evaluator.py @@ -5,13 +5,14 @@ from operator import attrgetter from pathlib import Path from timeit import default_timer as timer -from typing import List +from typing import Optional import yaml +from pydantic import BaseModel from benchllm.data_types import Evaluation, FunctionID, Prediction +from benchllm.input_types import Json from benchllm.listener import EvaluatorListener -from benchllm.similarity import semantically_similar class Evaluator(ABC): @@ -21,6 +22,10 @@ def __init__(self, workers: int = 1): self._evaluations: list[Evaluation] = [] self._workers: int = workers + class Match(BaseModel): + prediction: Json + expected: Json + def add_listener(self, listener: EvaluatorListener) -> None: self._listeners.append(listener) @@ -55,7 +60,9 @@ def _run_evaluation(self, prediction: Prediction) -> Evaluation: start = timer() match = self.evaluate_prediction(prediction) end = timer() - evaluation = Evaluation(prediction=prediction, passed=match, eval_time_elapsed=end - start) + evaluation = Evaluation( + prediction=prediction, passed=isinstance(match, Evaluator.Match), eval_time_elapsed=end - start + ) self._broadcast_evaluate_prediction_ended(evaluation) return evaluation @@ 
-71,8 +78,17 @@ def failed(self) -> list[Evaluation]: def evaluations(self) -> list[Evaluation]: return self._evaluations + @property + def workers(self) -> int: + return self._workers + + @property + def predictions(self) -> list[Prediction]: + return self._predictions + @abstractmethod - def evaluate_prediction(self, prediction: Prediction) -> bool: + def evaluate_prediction(self, prediction: Prediction) -> Optional[Match]: + """Evaluate a single prediction, return a Match if the prediction matches the expected output.""" pass def max_threads(self) -> int: @@ -101,59 +117,3 @@ def _broadcast_evaluate_module_ended(self) -> None: def _broadcast_evaluate_ended(self, evaluations: list[Evaluation]) -> None: for listener in self._listeners: listener.evaluate_ended(evaluations) - - -class SemanticEvaluator(Evaluator): - def __init__(self, *, model: str = "gpt-3", workers: int = 1): - super().__init__(workers=workers) - self.model = model - - def evaluate_prediction(self, prediction: Prediction) -> bool: - for expected in prediction.test.expected: - if semantically_similar(expected, prediction.output, model=self.model): - return True - return False - - -class StringMatchEvaluator(Evaluator): - def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1): - super().__init__(workers=workers) - - self._case_sensitive = case_sensitive - self._fuzzy = fuzzy - - def match_strings(self, expected: str, output: str) -> bool: - if not self._case_sensitive: - expected = expected.lower() - output = output.lower() - - if self._fuzzy: - return expected in output or output in expected - - return expected == output - - def evaluate_prediction(self, prediction: Prediction) -> bool: - output = prediction.output - return any([self.match_strings(expected, output) for expected in prediction.test.expected]) - - -def load_prediction_files(paths: List[Path]) -> List[Prediction]: - import json - - import yaml - - predictions = [] - for path in paths: - for file_path in path.rglob("*"): - if not file_path.is_file(): - continue - if file_path.suffix not in {".json", ".yml", ".yaml"}: - continue - with open(file_path, "r") as file: - if file_path.suffix == ".json": - data = json.load(file) - predictions.append(Prediction(**data)) - elif file_path.suffix in {".yml", ".yaml"}: - data = yaml.safe_load(file) - predictions.append(Prediction(**data)) - return predictions diff --git a/benchllm/evaluator/semantic.py b/benchllm/evaluator/semantic.py new file mode 100644 index 0000000..73881a4 --- /dev/null +++ b/benchllm/evaluator/semantic.py @@ -0,0 +1,17 @@ +from typing import Optional + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator +from benchllm.similarity import semantically_similar + + +class SemanticEvaluator(Evaluator): + def __init__(self, *, model: str = "gpt-3", workers: int = 1): + super().__init__(workers=workers) + self.model = model + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + for expected in prediction.test.expected: + if semantically_similar(expected, prediction.output, model=self.model): + return Evaluator.Match(prediction=prediction.output, expected=expected) + return None diff --git a/benchllm/evaluator/string_match.py b/benchllm/evaluator/string_match.py new file mode 100644 index 0000000..eb59951 --- /dev/null +++ b/benchllm/evaluator/string_match.py @@ -0,0 +1,30 @@ +import json +from typing import Optional + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + 
+ +class StringMatchEvaluator(Evaluator): + def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1): + super().__init__(workers=workers) + + self._case_sensitive = case_sensitive + self._fuzzy = fuzzy + + def match_strings(self, expected: str, output: str) -> bool: + if not self._case_sensitive: + expected = expected.lower() + output = output.lower() + + if self._fuzzy: + return expected in output or output in expected + + return expected == output + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + output = prediction.output + for expected in prediction.test.expected: + if self.match_strings(expected, output): + return Evaluator.Match(prediction=prediction.output, expected=expected) + return None diff --git a/benchllm/input_types.py b/benchllm/input_types.py index b643261..d72e503 100644 --- a/benchllm/input_types.py +++ b/benchllm/input_types.py @@ -1,8 +1,9 @@ -import json -from typing import TypedDict +from typing import TypedDict, Union from pydantic import BaseModel +Json = Union[str, bool, list, dict] + class ChatInputItem(TypedDict): role: str diff --git a/benchllm/similarity.py b/benchllm/similarity.py index a6f5fbd..787d7d6 100644 --- a/benchllm/similarity.py +++ b/benchllm/similarity.py @@ -47,4 +47,6 @@ def semantically_similar(answer1: str, answer2: str, model: str = "gpt-3") -> bo }}""", model=model, ) + if response not in ["same", "different"]: + raise ValueError(f"Unexpected response: {response}") return response == "same" diff --git a/benchllm/singleton.py b/benchllm/singleton.py index 503fc1f..456bbe2 100644 --- a/benchllm/singleton.py +++ b/benchllm/singleton.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Callable, Generic, Optional, Type, TypeVar +from typing import Callable, Generic, Optional, Type, TypeVar T = TypeVar("T") diff --git a/benchllm/utils.py b/benchllm/utils.py index 633ea99..b4f6f74 100644 --- a/benchllm/utils.py +++ b/benchllm/utils.py @@ -1,6 +1,11 @@ import ast +import json from pathlib import Path +import yaml + +from benchllm.data_types import Prediction + class DecoratorFinder(ast.NodeVisitor): def __init__(self) -> None: @@ -59,3 +64,21 @@ def find_json_yml_files(paths: list[Path]) -> list[Path]: if file.suffix in (".yml", ".json", ".yaml"): files.append(file) return list(set(files)) + + +def load_prediction_files(paths: list[Path]) -> list[Prediction]: + predictions = [] + for path in paths: + for file_path in path.rglob("*"): + if not file_path.is_file(): + continue + if file_path.suffix not in {".json", ".yml", ".yaml"}: + continue + with open(file_path, "r") as file: + if file_path.suffix == ".json": + data = json.load(file) + predictions.append(Prediction(**data)) + elif file_path.suffix in {".yml", ".yaml"}: + data = yaml.safe_load(file) + predictions.append(Prediction(**data)) + return predictions diff --git a/test/cache/test_file_cache.py b/test/cache/test_file_cache.py new file mode 100644 index 0000000..d5218c0 --- /dev/null +++ b/test/cache/test_file_cache.py @@ -0,0 +1,74 @@ +import tempfile +from pathlib import Path +from unittest.mock import patch + +from benchllm import Prediction, StringMatchEvaluator, Test +from benchllm.cache import FileCache +from benchllm.data_types import FunctionID + +EXAMPLE_PREDICTIONS = [ + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + 
output="def", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + +EXAMPLE_PREDICTIONS_ALL_SAME = [ + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +def test_file_writes_at_end(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + with tempfile.TemporaryDirectory() as temp_dir: + cache_path = Path(temp_dir, "cache.json") + evaluator = FileCache(StringMatchEvaluator(), cache_path) + evaluator.load(EXAMPLE_PREDICTIONS) + + evaluations = evaluator.run() + assert cache_path.exists() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + mock_method.reset_mock() + + # second run will use cache + evaluator = FileCache(StringMatchEvaluator(), cache_path) + evaluator.load(EXAMPLE_PREDICTIONS) + + evaluations = evaluator.run() + assert cache_path.exists() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 0 + assert evaluator.num_cache_hits == 3 diff --git a/test/cache/test_memory_cache.py b/test/cache/test_memory_cache.py new file mode 100644 index 0000000..0d4288b --- /dev/null +++ b/test/cache/test_memory_cache.py @@ -0,0 +1,163 @@ +from unittest.mock import patch + +from benchllm import Prediction, StringMatchEvaluator, Test +from benchllm.cache import MemoryCache +from benchllm.data_types import FunctionID + +EXAMPLE_PREDICTIONS = [ + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="def", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + +EXAMPLE_PREDICTIONS_ALL_SAME = [ + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +EXAMPLE_PREDICTIONS_CACHING_NEGATIVE = [ + Prediction( + test=Test(input="foo", expected=["no-match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["no-match", "match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match", "no-match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +def test_memory_cache_will_prevent_calls_to_evaluate_prediction_on_second_run(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS) + evaluations = evaluator.run() + 
assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + mock_method.reset_mock() + + # second run will use cache + evaluations = evaluator.run() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert evaluator.num_cache_hits == 4 + assert mock_method.call_count == 0 + + +def test_memory_cache_caches_during_run(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_ALL_SAME) + + evaluations = evaluator.run() + assert evaluations[0].passed + assert evaluations[1].passed + assert mock_method.call_count == 1 + assert evaluator.num_cache_hits == 1 + + +def test_memory_cache_caches_always_tries_to_pass(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) + + evaluations = evaluator.run() + assert not evaluations[0].passed + assert evaluations[1].passed + assert evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + + +def test_memory_cache_does_not_pass_on_cached_negatives(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) + + evaluator.run() + assert mock_method.call_count == 2 + assert mock_method.call_args_list.pop(0).args[0].test.expected == ["no-match"] + assert mock_method.call_args_list.pop(0).args[0].test.expected == ["match"] + + +def test_memory_cache_supports_numbers(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load( + [ + Prediction( + test=Test(input="foo", expected=["42"]), + output="42", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["42"]), + output="42", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["42"]), + output="24", + time_elapsed=0, + function_id=FunctionID.default(), + ), + ] + ) + evaluations = evaluator.run() + assert evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 diff --git a/test/cli/test_interactive.py b/test/cli/test_interactive.py new file mode 100644 index 0000000..61bee1c --- /dev/null +++ b/test/cli/test_interactive.py @@ -0,0 +1,35 @@ +from unittest import mock + +import typer + +from benchllm.cli.evaluator import InteractiveEvaluator +from benchllm.data_types import FunctionID, Prediction, Test + +TEST_PREDICTION = [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) +] + + +def test_interactive_press_y_passes(): + evalautor = InteractiveEvaluator() + evalautor.load(TEST_PREDICTION) + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "1"): + result = 
evalautor.run() + assert result[0].passed + + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "2"): + result = evalautor.run() + assert result[0].passed + + +def test_interactive_press_n_fails(): + evalautor = InteractiveEvaluator() + evalautor.load(TEST_PREDICTION) + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "n"): + result = evalautor.run() + assert not result[0].passed diff --git a/test/cli/test_list_tests.py b/test/cli/test_list_tests.py new file mode 100644 index 0000000..e54a541 --- /dev/null +++ b/test/cli/test_list_tests.py @@ -0,0 +1,14 @@ +from pathlib import Path + +from typer.testing import CliRunner + +from benchllm.cli.main import app + +runner = CliRunner() + + +def test_list_tests(): + result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")]) + assert "Input" in result.stdout + assert "No." in result.stdout + assert "Expected" in result.stdout diff --git a/test/test_cli_main.py b/test/cli/test_run_suite.py similarity index 73% rename from test/test_cli_main.py rename to test/cli/test_run_suite.py index 653cf47..1fa2ae5 100644 --- a/test/test_cli_main.py +++ b/test/cli/test_run_suite.py @@ -1,4 +1,3 @@ -from pathlib import Path from test.utils import create_openai_object from unittest.mock import MagicMock, patch @@ -19,10 +18,3 @@ def test_run_multiple_suites(completion_mock: MagicMock): def test_run_target_suite(completion_mock: MagicMock): runner.invoke(app, ["run", "examples/qa"]) completion_mock.assert_called() - - -def test_list_tests(): - result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")]) - assert "Input" in result.stdout - assert "No." in result.stdout - assert "Expected" in result.stdout diff --git a/test/evaulator/test_evalutator.py b/test/evaulator/test_evalutator.py new file mode 100644 index 0000000..ab8aa4b --- /dev/null +++ b/test/evaulator/test_evalutator.py @@ -0,0 +1,34 @@ +import json +import tempfile +from pathlib import Path +from test.utils import create_openai_object +from unittest.mock import MagicMock, Mock, call, patch + +from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test +from benchllm.cache import MemoryCache +from benchllm.data_types import FunctionID +from benchllm.evaluator import Evaluator + + +class NoopEvaluator(Evaluator): + def evaluate_prediction(self, prediction: Prediction) -> Evaluator.Match: + return Evaluator.Match(prediction=prediction.output, expected=prediction.output) + + +def test_evaluator_can_load_prediction_file(): + prediction = { + "output": "42", + "test": {"input": "1+1", "expected": ["2"]}, + "time_elapsed": 0, + "function_id": {"module_path": "test", "line_number": 1}, + } + with tempfile.TemporaryDirectory() as tmpdir: + prediction_path = Path(tmpdir, "prediction.json") + prediction_path.write_bytes(json.dumps(prediction).encode()) + + evaluator = NoopEvaluator() + evaluator.load_prediction_file(prediction_path) + + assert evaluator.predictions[0].output == "42" + assert evaluator.predictions[0].test.input == "1+1" + assert evaluator.predictions[0].test.expected == ["2"] diff --git a/test/evaulator/test_semantic.py b/test/evaulator/test_semantic.py new file mode 100644 index 0000000..92dd9b2 --- /dev/null +++ b/test/evaulator/test_semantic.py @@ -0,0 +1,60 @@ +from test.utils import create_openai_object +from unittest.mock import MagicMock, patch + +from benchllm import Prediction, SemanticEvaluator, Test +from benchllm.data_types import FunctionID + + +@patch("openai.Completion.create", 
return_value=create_openai_object("same")) +def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3") + evaluator.load( + [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am."]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) + ] + ) + evaluations = evaluator.run() + completion_mock.assert_called_once() + assert evaluations[0].passed + + +@patch("openai.Completion.create", return_value=create_openai_object("different")) +def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3") + evaluator.load( + [ + Prediction( + test=Test(input="What are you?", expected=["Everything"]), + output="Nothing", + time_elapsed=0, + function_id=FunctionID.default(), + ), + ] + ) + evaluations = evaluator.run() + completion_mock.assert_called_once() + assert not evaluations[0].passed + + +@patch("openai.Completion.create", return_value=create_openai_object("same")) +def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3", workers=10) + evaluator.load( + [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am."]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) + for _ in range(100) + ] + ) + evaluations = evaluator.run() + assert completion_mock.call_count == 100 + assert all([evaluation.passed for evaluation in evaluations]) diff --git a/test/test_evalutator.py b/test/evaulator/test_string_match.py similarity index 50% rename from test/test_evalutator.py rename to test/evaulator/test_string_match.py index 608172a..75ecc89 100644 --- a/test/test_evalutator.py +++ b/test/evaulator/test_string_match.py @@ -1,9 +1,4 @@ -import tempfile -from pathlib import Path -from test.utils import create_openai_object -from unittest.mock import MagicMock, Mock, call, patch - -from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test +from benchllm import Prediction, StringMatchEvaluator, Test from benchllm.data_types import FunctionID @@ -66,58 +61,3 @@ def test_string_match_passes_if_output_is_equal_to_expected_fuzzy(): evaluations = evaluator.run() assert evaluations[0].passed assert not evaluations[1].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("same")) -def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3") - evaluator.load( - [ - Prediction( - test=Test(input="Who are you?", expected=["Yoda I am."]), - output="I am Yoda.", - time_elapsed=0, - function_id=FunctionID.default(), - ) - ] - ) - evaluations = evaluator.run() - completion_mock.assert_called_once() - assert evaluations[0].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("different")) -def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3") - evaluator.load( - [ - Prediction( - test=Test(input="What are you?", expected=["Everything"]), - output="Nothing", - time_elapsed=0, - function_id=FunctionID.default(), - ), - ] - ) - evaluations = evaluator.run() - completion_mock.assert_called_once() - assert not evaluations[0].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("same")) -def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3", 
workers=10) - evaluator.load( - [ - Prediction( - test=Test(input="Who are you?", expected=["Yoda I am."]), - output="I am Yoda.", - time_elapsed=0, - function_id=FunctionID.default(), - ) - for _ in range(100) - ] - ) - evaluations = evaluator.run() - assert completion_mock.call_count == 100 - assert all([evaluation.passed for evaluation in evaluations])