diff --git a/README.md b/README.md
index 4206fe0..972395d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# 🏋️‍♂️ BenchLLM 🏋️‍♀️
-🦾 Continuous Integration for LLM powered applications 🦙🦙🤖
+🦾 Continuous Integration for LLM powered applications 🦙🦙🤖
[![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers)
[![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs)
@@ -10,7 +10,6 @@
BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community
-
## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)
@@ -26,7 +25,7 @@ Use BenchLLM to:
> ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes.
>
->For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
+> For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
## 🧪 BenchLLM Testing Methodology
@@ -116,6 +115,16 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
$ bench run --evaluator string-match --workers 5
```
+To accelerate the evaluation process, BenchLLM uses a cache. When a cache is enabled, the result of evaluating each (prediction, expected) pair is stored and reused, so pairs that have already been evaluated are not evaluated again. There are several types of caches:
+
+- `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N` (see the example below).
+- `file`, stores the cache as a JSON file in `output/cache.json` at the end of the run. This is the default behavior.
+- `none`, does not use any cache.
+
+```bash
+$ bench run examples --cache memory
+```
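+
+For example, when spotting flaky output with `--retry-count`, the in-memory cache avoids re-evaluating identical outputs within the same run:
+
+```bash
+$ bench run examples --retry-count 3 --cache memory
+```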
+
### 🧮 Eval
While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function.
@@ -163,6 +172,20 @@ results = evaluator.run()
print(results)
```
+If you want to incorporate caching and run multiple evaluation jobs in parallel, you can modify your evaluator as follows:
+
+```python
+from benchllm.cache import FileCache
+
+...
+
+evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json"))
+evaluator.load(predictions)
+results = evaluator.run()
+```
+
+In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow for parallel evaluations. The cached results are saved to the file specified by `Path("path/to/cache.json")`.
+
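+Similarly, if the cache does not need to persist between runs, you can wrap the evaluator in `MemoryCache`. Here is a minimal sketch mirroring the example above:
+
+```python
+from benchllm.cache import MemoryCache
+
+...
+
+evaluator = MemoryCache(StringMatchEvaluator(workers=2))
+evaluator.load(predictions)
+results = evaluator.run()
+```
+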
## ⚙️ Commands
- `bench add`: Add a new test to a suite.
diff --git a/benchllm/__init__.py b/benchllm/__init__.py
index 43cb476..8c0bf5f 100644
--- a/benchllm/__init__.py
+++ b/benchllm/__init__.py
@@ -1,10 +1,10 @@
import inspect
from pathlib import Path
-from typing import Any, Callable, Generic, Optional, Type, TypeVar
+from typing import Callable, Type, TypeVar
from .data_types import Evaluation, Prediction, Test # noqa
from .evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator # noqa
-from .input_types import ChatInput, SimilarityInput
+from .input_types import ChatInput, SimilarityInput # noqa
from .similarity import semantically_similar # noqa
from .singleton import TestSingleton # noqa
from .tester import Tester # noqa
diff --git a/benchllm/cache.py b/benchllm/cache.py
new file mode 100644
index 0000000..9981d47
--- /dev/null
+++ b/benchllm/cache.py
@@ -0,0 +1,96 @@
+import json
+from pathlib import Path
+from typing import Optional
+
+from benchllm.data_types import Evaluation, Prediction
+from benchllm.evaluator import Evaluator
+from benchllm.input_types import Json
+from benchllm.listener import EvaluatorListener
+
+
+class MemoryCache(Evaluator):
+ """Caches the results of the evaluator in memory"""
+
+ def __init__(self, evaluator: Evaluator):
+ super().__init__(workers=evaluator.workers)
+ self._data: dict = {}
+ self._evaluator = evaluator
+ self._num_cache_misses = 0
+ self._num_cache_hits = 0
+
+ def _key(self, answer1: Json, answer2: Json) -> str:
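+        # Build an order-independent key so (answer1, answer2) and (answer2, answer1) share one cache entry.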
+ key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1])
+ return key1 if key1 < key2 else key2
+
+ def lookup(self, answer1: Json, answer2: Json) -> Optional[bool]:
+ return self._data.get(self._key(answer1, answer2), None)
+
+ def store(self, answer1: Json, answer2: Json, value: bool) -> None:
+ key = self._key(answer1, answer2)
+ self._data[key] = value
+
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
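+        # Check the cache for every expected answer first; only expectations that
+        # have never been evaluated are forwarded to the wrapped evaluator.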
+ uncached_expectations = []
+ for expected in prediction.test.expected:
+ lookup = self.lookup(expected, prediction.output)
+ if lookup is None:
+ uncached_expectations.append(expected)
+ elif lookup:
+ # If we find a positive match we can stop comparing and just return.
+ # For negative matches we still need to check the other expected answers.
+ self._num_cache_hits += 1
+ return Evaluator.Match(prediction=prediction.output, expected=expected)
+
+ # If all expectations were found in the cache but were negative matches,
+ # we increment the cache hits counter and return None as there's no match.
+ if not uncached_expectations:
+ self._num_cache_hits += 1
+ return None
+
+ self._num_cache_misses += 1
+ # set prediction.test.expected to only the ones that were not cached
+ prediction = Prediction(**prediction.dict())
+ prediction.test.expected = uncached_expectations
+ result = self._evaluator.evaluate_prediction(prediction)
+ if result:
+ self.store(result.expected, result.prediction, True)
+ else:
+ for expected in prediction.test.expected:
+ self.store(expected, prediction.output, False)
+ return result
+
+ @property
+ def num_cache_hits(self) -> int:
+ return self._num_cache_hits
+
+ @property
+ def num_cache_misses(self) -> int:
+ return self._num_cache_misses
+
+
+class FileCache(MemoryCache, EvaluatorListener):
+ """Caches the results of the evaluator in a json file"""
+
+ def __init__(self, evaluator: Evaluator, path: Path):
+ super().__init__(evaluator)
+ self._path = path
+ self.add_listener(self)
+ self._load()
+
+ def _load(self) -> None:
+ if self._path.exists():
+ try:
+ cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str)
+ if cache["version"] != "1":
+ raise ValueError("Unsupported cache version")
+ self._data = cache["entries"]
+ except Exception:
+ print(f"Failed to load cache file {self._path}")
+ self._data = {}
+
+ def _save(self) -> None:
+ cache = {"entries": self._data, "version": "1"}
+ self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8")
+
+ def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
+ self._save()
diff --git a/benchllm/cli/commands/evaluate.py b/benchllm/cli/commands/evaluate.py
index e3e14bc..0451d5d 100644
--- a/benchllm/cli/commands/evaluate.py
+++ b/benchllm/cli/commands/evaluate.py
@@ -1,13 +1,13 @@
from pathlib import Path
+from benchllm.cache import FileCache
from benchllm.cli.listener import ReportListener, RichCliListener
-from benchllm.cli.utils import get_evaluator
-from benchllm.evaluator import load_prediction_files
-from benchllm.utils import find_json_yml_files
+from benchllm.cli.utils import add_cache, get_evaluator
+from benchllm.utils import find_json_yml_files, load_prediction_files
def evaluate_predictions(
- file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str
+ file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str
) -> bool:
files = find_json_yml_files(file_or_dir)
@@ -17,6 +17,10 @@ def evaluate_predictions(
load_prediction_files(file_or_dir)
evaluator = get_evaluator(evaluator_name, model, workers)
+ evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
+
+    cli_listener.set_evaluator(evaluator)
+
evaluator.add_listener(cli_listener)
evaluator.add_listener(report_listener)
for file in files:
diff --git a/benchllm/cli/commands/run_suite.py b/benchllm/cli/commands/run_suite.py
index 53e39ca..075d8c0 100644
--- a/benchllm/cli/commands/run_suite.py
+++ b/benchllm/cli/commands/run_suite.py
@@ -2,8 +2,9 @@
import typer
+from benchllm.cache import FileCache
from benchllm.cli.listener import ReportListener, RichCliListener
-from benchllm.cli.utils import get_evaluator
+from benchllm.cli.utils import add_cache, get_evaluator
from benchllm.tester import Tester
from benchllm.utils import find_files
@@ -17,6 +18,7 @@ def run_suite(
workers: int,
evaluator_name: str,
retry_count: int,
+ cache: str,
) -> bool:
files = find_files(file_search_paths)
if not files:
@@ -45,6 +47,10 @@ def run_suite(
return True
evaluator = get_evaluator(evaluator_name, model, workers)
+ evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
+
+    cli_listener.set_evaluator(evaluator)
+
evaluator.add_listener(cli_listener)
evaluator.add_listener(report_listener)
evaluator.load(tester.predictions)
diff --git a/benchllm/cli/evaluator.py b/benchllm/cli/evaluator.py
deleted file mode 100644
index d36bc85..0000000
--- a/benchllm/cli/evaluator.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import signal
-
-import typer
-from pywebio import session
-from pywebio.input import actions
-from pywebio.output import put_markdown, put_table
-
-from benchllm.data_types import Prediction
-from benchllm.evaluator import Evaluator
-
-
-class InteractiveEvaluator(Evaluator):
- def evaluate_prediction(self, prediction: Prediction) -> bool:
- header = (
- f'{typer.style("Does ", bold=True)}'
- f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}"
- f'{typer.style(" match any of the following expected prompts?", bold=True)}'
- )
- typer.echo("")
- typer.echo(header)
-
- for i, expected in enumerate(prediction.test.expected, start=1):
- typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False)
- typer.secho(expected, bold=True)
-
- while True:
- prompt_string = (
- f'{typer.style("[")}'
- f'{typer.style("y", fg=typer.colors.GREEN, bold=True)}'
- f'{typer.style("/")}'
- f'{typer.style("n", fg=typer.colors.RED, bold=True)}'
- f'{typer.style("]")}'
- )
-
- response = response = typer.prompt(prompt_string).lower()
- if response == "y":
- return True
- elif response == "n":
- return False
- else:
- typer.secho(
- 'Invalid answer. Please just use "y" to mark the test as correct, and "n" to mark the test as incorrect',
- fg=typer.colors.RED,
- bold=True,
- )
- continue
-
-
-class WebEvaluator(Evaluator):
- def __init__(self):
- super().__init__(workers=1)
-
- @session.defer_call
- def on_close():
- typer.secho(
- f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True
- )
- # sys.exit doesn't work here, so we have to raise a signal to kill the process
- signal.raise_signal(signal.SIGINT)
-
- put_markdown("# BenchLLM Web Evaluator")
-
- def evaluate_prediction(self, prediction: Prediction) -> bool:
- test_name = prediction.test.file_path or prediction.test.id
-
- put_markdown(f"## {test_name}")
- table = [["Question:", f"{prediction.test.input}"], ["Prediction:", prediction.output]]
- for i, expected in enumerate(prediction.test.expected):
- table.append([f"Expected ({i+1}):", expected])
-
- put_table(table)
-
- result = actions(
- label="Does the prediction match any of the answers?",
- buttons=[{"label": "Yes", "value": True}, {"label": "No", "value": False}],
- )
-
- return bool(result)
diff --git a/benchllm/cli/evaluator/__init__.py b/benchllm/cli/evaluator/__init__.py
new file mode 100644
index 0000000..fb9ee8a
--- /dev/null
+++ b/benchllm/cli/evaluator/__init__.py
@@ -0,0 +1,2 @@
+from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa
+from benchllm.cli.evaluator.web import WebEvaluator # noqa
diff --git a/benchllm/cli/evaluator/interactive.py b/benchllm/cli/evaluator/interactive.py
new file mode 100644
index 0000000..25aa068
--- /dev/null
+++ b/benchllm/cli/evaluator/interactive.py
@@ -0,0 +1,31 @@
+from typing import Optional
+
+import click
+import typer
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class InteractiveEvaluator(Evaluator):
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+ header = (
+ f'{typer.style("Does ", bold=True)}'
+ f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}"
+ f'{typer.style(" match any of the following expected prompts?", bold=True)}'
+ )
+ typer.echo("")
+ typer.echo(header)
+
+ for i, expected in enumerate(prediction.test.expected, start=1):
+ typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False)
+ typer.secho(expected, bold=True)
+
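+        # Valid responses: the 1-based index of a matching expected answer, or "n" for no match.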
+ options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"]
+
+ prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]"
+ click_choice = click.Choice(options)
+ response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower()
+ if response == "n":
+ return None
+ return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[int(response) - 1])
diff --git a/benchllm/cli/evaluator/web.py b/benchllm/cli/evaluator/web.py
new file mode 100644
index 0000000..7ca7caf
--- /dev/null
+++ b/benchllm/cli/evaluator/web.py
@@ -0,0 +1,47 @@
+import signal
+from typing import Optional
+
+import typer
+from pywebio import session
+from pywebio.input import radio
+from pywebio.output import put_markdown
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class WebEvaluator(Evaluator):
+ def __init__(self) -> None:
+ super().__init__(workers=1)
+
+ @session.defer_call
+ def on_close() -> None:
+ print("shutting down")
+ typer.secho(
+ f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True
+ )
+ # sys.exit doesn't work here, so we have to raise a signal to kill the process
+ signal.raise_signal(signal.SIGINT)
+
+ put_markdown("# BenchLLM Web Evaluator")
+
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+ test_name = prediction.test.file_path or prediction.test.id
+
+ put_markdown(f"## {test_name}")
+ put_markdown(f"*Question*: `{prediction.test.input}`")
+ put_markdown(f"*Prediction*: `{prediction.output}`")
+
+ options: list[dict[str, Optional[int | str]]] = [
+ {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected)
+ ]
+ options.append({"label": "None", "value": None, "selected": True})
+ answer = radio("Pick the matching answer", options=options, required=True)
+
+        if isinstance(answer, int):
+ return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[answer])
+ else:
+ return None
diff --git a/benchllm/cli/listener.py b/benchllm/cli/listener.py
index e1f517e..f1f6135 100644
--- a/benchllm/cli/listener.py
+++ b/benchllm/cli/listener.py
@@ -1,6 +1,7 @@
import datetime
import json
from pathlib import Path
+from typing import Optional
import typer
from rich import print
@@ -8,7 +9,9 @@
from rich.markup import render
from rich.table import Table
+from benchllm.cache import MemoryCache
from benchllm.data_types import Evaluation, FunctionID, Prediction, Test, TestFunction
+from benchllm.evaluator import Evaluator
from benchllm.listener import EvaluatorListener, TesterListener
@@ -37,12 +40,23 @@ def evaluate_prediction_ended(self, evaluation: Evaluation) -> None:
class RichCliListener(TesterListener, EvaluatorListener):
- def __init__(self, root_dir: Path, *, interactive: bool, test_only: bool = False, eval_only: bool = False) -> None:
+ def __init__(
+ self,
+ root_dir: Path,
+ *,
+ interactive: bool,
+ test_only: bool = False,
+ eval_only: bool = False,
+ ) -> None:
super().__init__()
self.root_dir = root_dir
self.interactive = interactive
self._eval_only = eval_only
self._test_only = test_only
+ self._evaluator: Optional[Evaluator] = None
+
+    def set_evaluator(self, evaluator: Evaluator) -> None:
+ self._evaluator = evaluator
def test_run_started(self) -> None:
print_centered(" Run Tests ")
@@ -116,6 +130,9 @@ def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
console.print(table)
tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] "
+ if isinstance(self._evaluator, MemoryCache):
+ tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) "
+
print_centered(tmp)
diff --git a/benchllm/cli/main.py b/benchllm/cli/main.py
index 353fca7..e2fe202 100644
--- a/benchllm/cli/main.py
+++ b/benchllm/cli/main.py
@@ -28,6 +28,7 @@ def run(
workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1,
evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
+ cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
) -> None:
if not file_or_dir:
file_or_dir = [Path.cwd()]
@@ -40,6 +41,7 @@ def run(
evaluator_name=evaluator,
no_eval=not eval,
retry_count=retry_count,
+ cache=cache,
)
if not success:
raise typer.Exit(code=1)
@@ -61,6 +63,7 @@ def eval(
model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3",
workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
+ cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
) -> None:
success = evaluate_predictions(
file_or_dir=file_or_dir,
@@ -68,6 +71,7 @@ def eval(
output_dir=output_dir,
workers=workers,
evaluator_name=evaluator,
+ cache=cache,
)
if not success:
raise typer.Exit(code=1)
diff --git a/benchllm/cli/utils.py b/benchllm/cli/utils.py
index 9364e76..501048e 100644
--- a/benchllm/cli/utils.py
+++ b/benchllm/cli/utils.py
@@ -1,6 +1,7 @@
import datetime
from pathlib import Path
+from benchllm.cache import FileCache, MemoryCache
from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator
from benchllm.evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator
@@ -28,3 +29,14 @@ def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
return WebEvaluator()
else:
raise ValueError(f"Unknown evaluator {evaluator_name}")
+
+
+def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator:
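+    """Wrap the evaluator in the cache selected by the --cache option ('file', 'memory' or 'none')."""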
+ if cache_name == "file":
+ return FileCache(evaluator, cache_path)
+ elif cache_name == "memory":
+ return MemoryCache(evaluator)
+ elif cache_name == "none":
+ return evaluator
+ else:
+ raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'")
diff --git a/benchllm/evaluator/__init__.py b/benchllm/evaluator/__init__.py
new file mode 100644
index 0000000..d92e96e
--- /dev/null
+++ b/benchllm/evaluator/__init__.py
@@ -0,0 +1,3 @@
+from benchllm.evaluator.evaluator import Evaluator # noqa
+from benchllm.evaluator.semantic import SemanticEvaluator # noqa
+from benchllm.evaluator.string_match import StringMatchEvaluator # noqa
diff --git a/benchllm/evaluator.py b/benchllm/evaluator/evaluator.py
similarity index 63%
rename from benchllm/evaluator.py
rename to benchllm/evaluator/evaluator.py
index 27d8f68..87affb6 100644
--- a/benchllm/evaluator.py
+++ b/benchllm/evaluator/evaluator.py
@@ -5,13 +5,14 @@
from operator import attrgetter
from pathlib import Path
from timeit import default_timer as timer
-from typing import List
+from typing import Optional
import yaml
+from pydantic import BaseModel
from benchllm.data_types import Evaluation, FunctionID, Prediction
+from benchllm.input_types import Json
from benchllm.listener import EvaluatorListener
-from benchllm.similarity import semantically_similar
class Evaluator(ABC):
@@ -21,6 +22,10 @@ def __init__(self, workers: int = 1):
self._evaluations: list[Evaluation] = []
self._workers: int = workers
+ class Match(BaseModel):
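+        """Returned by evaluate_prediction when the prediction matches one of the expected answers."""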
+ prediction: Json
+ expected: Json
+
def add_listener(self, listener: EvaluatorListener) -> None:
self._listeners.append(listener)
@@ -55,7 +60,9 @@ def _run_evaluation(self, prediction: Prediction) -> Evaluation:
start = timer()
match = self.evaluate_prediction(prediction)
end = timer()
- evaluation = Evaluation(prediction=prediction, passed=match, eval_time_elapsed=end - start)
+ evaluation = Evaluation(
+ prediction=prediction, passed=isinstance(match, Evaluator.Match), eval_time_elapsed=end - start
+ )
self._broadcast_evaluate_prediction_ended(evaluation)
return evaluation
@@ -71,8 +78,17 @@ def failed(self) -> list[Evaluation]:
def evaluations(self) -> list[Evaluation]:
return self._evaluations
+ @property
+ def workers(self) -> int:
+ return self._workers
+
+ @property
+ def predictions(self) -> list[Prediction]:
+ return self._predictions
+
@abstractmethod
- def evaluate_prediction(self, prediction: Prediction) -> bool:
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Match]:
+ """Evaluate a single prediction, return a Match if the prediction matches the expected output."""
pass
def max_threads(self) -> int:
@@ -101,59 +117,3 @@ def _broadcast_evaluate_module_ended(self) -> None:
def _broadcast_evaluate_ended(self, evaluations: list[Evaluation]) -> None:
for listener in self._listeners:
listener.evaluate_ended(evaluations)
-
-
-class SemanticEvaluator(Evaluator):
- def __init__(self, *, model: str = "gpt-3", workers: int = 1):
- super().__init__(workers=workers)
- self.model = model
-
- def evaluate_prediction(self, prediction: Prediction) -> bool:
- for expected in prediction.test.expected:
- if semantically_similar(expected, prediction.output, model=self.model):
- return True
- return False
-
-
-class StringMatchEvaluator(Evaluator):
- def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1):
- super().__init__(workers=workers)
-
- self._case_sensitive = case_sensitive
- self._fuzzy = fuzzy
-
- def match_strings(self, expected: str, output: str) -> bool:
- if not self._case_sensitive:
- expected = expected.lower()
- output = output.lower()
-
- if self._fuzzy:
- return expected in output or output in expected
-
- return expected == output
-
- def evaluate_prediction(self, prediction: Prediction) -> bool:
- output = prediction.output
- return any([self.match_strings(expected, output) for expected in prediction.test.expected])
-
-
-def load_prediction_files(paths: List[Path]) -> List[Prediction]:
- import json
-
- import yaml
-
- predictions = []
- for path in paths:
- for file_path in path.rglob("*"):
- if not file_path.is_file():
- continue
- if file_path.suffix not in {".json", ".yml", ".yaml"}:
- continue
- with open(file_path, "r") as file:
- if file_path.suffix == ".json":
- data = json.load(file)
- predictions.append(Prediction(**data))
- elif file_path.suffix in {".yml", ".yaml"}:
- data = yaml.safe_load(file)
- predictions.append(Prediction(**data))
- return predictions
diff --git a/benchllm/evaluator/semantic.py b/benchllm/evaluator/semantic.py
new file mode 100644
index 0000000..73881a4
--- /dev/null
+++ b/benchllm/evaluator/semantic.py
@@ -0,0 +1,17 @@
+from typing import Optional
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+from benchllm.similarity import semantically_similar
+
+
+class SemanticEvaluator(Evaluator):
+ def __init__(self, *, model: str = "gpt-3", workers: int = 1):
+ super().__init__(workers=workers)
+ self.model = model
+
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+ for expected in prediction.test.expected:
+ if semantically_similar(expected, prediction.output, model=self.model):
+ return Evaluator.Match(prediction=prediction.output, expected=expected)
+ return None
diff --git a/benchllm/evaluator/string_match.py b/benchllm/evaluator/string_match.py
new file mode 100644
index 0000000..eb59951
--- /dev/null
+++ b/benchllm/evaluator/string_match.py
@@ -0,0 +1,30 @@
+import json
+from typing import Optional
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class StringMatchEvaluator(Evaluator):
+ def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1):
+ super().__init__(workers=workers)
+
+ self._case_sensitive = case_sensitive
+ self._fuzzy = fuzzy
+
+ def match_strings(self, expected: str, output: str) -> bool:
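+        # Comparison is case-insensitive by default; fuzzy mode also accepts substring matches in either direction.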
+ if not self._case_sensitive:
+ expected = expected.lower()
+ output = output.lower()
+
+ if self._fuzzy:
+ return expected in output or output in expected
+
+ return expected == output
+
+ def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+ output = prediction.output
+ for expected in prediction.test.expected:
+ if self.match_strings(expected, output):
+ return Evaluator.Match(prediction=prediction.output, expected=expected)
+ return None
diff --git a/benchllm/input_types.py b/benchllm/input_types.py
index b643261..d72e503 100644
--- a/benchllm/input_types.py
+++ b/benchllm/input_types.py
@@ -1,8 +1,9 @@
-import json
-from typing import TypedDict
+from typing import TypedDict, Union
from pydantic import BaseModel
+Json = Union[str, bool, list, dict]
+
class ChatInputItem(TypedDict):
role: str
diff --git a/benchllm/similarity.py b/benchllm/similarity.py
index a6f5fbd..787d7d6 100644
--- a/benchllm/similarity.py
+++ b/benchllm/similarity.py
@@ -47,4 +47,6 @@ def semantically_similar(answer1: str, answer2: str, model: str = "gpt-3") -> bo
}}""",
model=model,
)
+ if response not in ["same", "different"]:
+ raise ValueError(f"Unexpected response: {response}")
return response == "same"
diff --git a/benchllm/singleton.py b/benchllm/singleton.py
index 503fc1f..456bbe2 100644
--- a/benchllm/singleton.py
+++ b/benchllm/singleton.py
@@ -1,5 +1,5 @@
from pathlib import Path
-from typing import Any, Callable, Generic, Optional, Type, TypeVar
+from typing import Callable, Generic, Optional, Type, TypeVar
T = TypeVar("T")
diff --git a/benchllm/utils.py b/benchllm/utils.py
index 633ea99..b4f6f74 100644
--- a/benchllm/utils.py
+++ b/benchllm/utils.py
@@ -1,6 +1,11 @@
import ast
+import json
from pathlib import Path
+import yaml
+
+from benchllm.data_types import Prediction
+
class DecoratorFinder(ast.NodeVisitor):
def __init__(self) -> None:
@@ -59,3 +64,21 @@ def find_json_yml_files(paths: list[Path]) -> list[Path]:
if file.suffix in (".yml", ".json", ".yaml"):
files.append(file)
return list(set(files))
+
+
+def load_prediction_files(paths: list[Path]) -> list[Prediction]:
+ predictions = []
+ for path in paths:
+ for file_path in path.rglob("*"):
+ if not file_path.is_file():
+ continue
+ if file_path.suffix not in {".json", ".yml", ".yaml"}:
+ continue
+ with open(file_path, "r") as file:
+ if file_path.suffix == ".json":
+ data = json.load(file)
+ predictions.append(Prediction(**data))
+ elif file_path.suffix in {".yml", ".yaml"}:
+ data = yaml.safe_load(file)
+ predictions.append(Prediction(**data))
+ return predictions
diff --git a/test/cache/test_file_cache.py b/test/cache/test_file_cache.py
new file mode 100644
index 0000000..d5218c0
--- /dev/null
+++ b/test/cache/test_file_cache.py
@@ -0,0 +1,74 @@
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+from benchllm import Prediction, StringMatchEvaluator, Test
+from benchllm.cache import FileCache
+from benchllm.data_types import FunctionID
+
+EXAMPLE_PREDICTIONS = [
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="no-match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="def",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="no-match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+]
+
+EXAMPLE_PREDICTIONS_ALL_SAME = [
+ Prediction(
+ test=Test(input="foo", expected=["match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+]
+
+
+def test_file_writes_at_end():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ with tempfile.TemporaryDirectory() as temp_dir:
+ cache_path = Path(temp_dir, "cache.json")
+ evaluator = FileCache(StringMatchEvaluator(), cache_path)
+ evaluator.load(EXAMPLE_PREDICTIONS)
+
+ evaluations = evaluator.run()
+ assert cache_path.exists()
+ assert not evaluations[0].passed
+ assert evaluations[1].passed
+ assert not evaluations[2].passed
+ assert mock_method.call_count == 2
+ assert evaluator.num_cache_hits == 1
+ mock_method.reset_mock()
+
+ # second run will use cache
+ evaluator = FileCache(StringMatchEvaluator(), cache_path)
+ evaluator.load(EXAMPLE_PREDICTIONS)
+
+ evaluations = evaluator.run()
+ assert cache_path.exists()
+ assert not evaluations[0].passed
+ assert evaluations[1].passed
+ assert not evaluations[2].passed
+ assert mock_method.call_count == 0
+ assert evaluator.num_cache_hits == 3
diff --git a/test/cache/test_memory_cache.py b/test/cache/test_memory_cache.py
new file mode 100644
index 0000000..0d4288b
--- /dev/null
+++ b/test/cache/test_memory_cache.py
@@ -0,0 +1,163 @@
+from unittest.mock import patch
+
+from benchllm import Prediction, StringMatchEvaluator, Test
+from benchllm.cache import MemoryCache
+from benchllm.data_types import FunctionID
+
+EXAMPLE_PREDICTIONS = [
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="no-match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="def",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["abc", "def", "ghi"]),
+ output="no-match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+]
+
+EXAMPLE_PREDICTIONS_ALL_SAME = [
+ Prediction(
+ test=Test(input="foo", expected=["match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+]
+
+
+EXAMPLE_PREDICTIONS_CACHING_NEGATIVE = [
+ Prediction(
+ test=Test(input="foo", expected=["no-match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["no-match", "match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["match", "no-match"]),
+ output="match",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+]
+
+
+def test_memory_cache_will_prevent_calls_to_evaluate_prediction_on_second_run():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ evaluator = MemoryCache(StringMatchEvaluator())
+ evaluator.load(EXAMPLE_PREDICTIONS)
+ evaluations = evaluator.run()
+ assert not evaluations[0].passed
+ assert evaluations[1].passed
+ assert not evaluations[2].passed
+ assert mock_method.call_count == 2
+ assert evaluator.num_cache_hits == 1
+ mock_method.reset_mock()
+
+ # second run will use cache
+ evaluations = evaluator.run()
+ assert not evaluations[0].passed
+ assert evaluations[1].passed
+ assert not evaluations[2].passed
+ assert evaluator.num_cache_hits == 4
+ assert mock_method.call_count == 0
+
+
+def test_memory_cache_caches_during_run():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ evaluator = MemoryCache(StringMatchEvaluator())
+ evaluator.load(EXAMPLE_PREDICTIONS_ALL_SAME)
+
+ evaluations = evaluator.run()
+ assert evaluations[0].passed
+ assert evaluations[1].passed
+ assert mock_method.call_count == 1
+ assert evaluator.num_cache_hits == 1
+
+
+def test_memory_cache_caches_always_tries_to_pass():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ evaluator = MemoryCache(StringMatchEvaluator())
+ evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE)
+
+ evaluations = evaluator.run()
+ assert not evaluations[0].passed
+ assert evaluations[1].passed
+ assert evaluations[2].passed
+ assert mock_method.call_count == 2
+ assert evaluator.num_cache_hits == 1
+
+
+def test_memory_cache_does_not_pass_on_cached_negatives():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ evaluator = MemoryCache(StringMatchEvaluator())
+ evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE)
+
+ evaluator.run()
+ assert mock_method.call_count == 2
+ assert mock_method.call_args_list.pop(0).args[0].test.expected == ["no-match"]
+ assert mock_method.call_args_list.pop(0).args[0].test.expected == ["match"]
+
+
+def test_memory_cache_supports_numbers():
+ with patch.object(
+ StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
+ ) as mock_method:
+ evaluator = MemoryCache(StringMatchEvaluator())
+ evaluator.load(
+ [
+ Prediction(
+ test=Test(input="foo", expected=["42"]),
+ output="42",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["42"]),
+ output="42",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ Prediction(
+ test=Test(input="foo", expected=["42"]),
+ output="24",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ ]
+ )
+ evaluations = evaluator.run()
+ assert evaluations[0].passed
+ assert evaluations[1].passed
+ assert not evaluations[2].passed
+ assert mock_method.call_count == 2
+ assert evaluator.num_cache_hits == 1
diff --git a/test/cli/test_interactive.py b/test/cli/test_interactive.py
new file mode 100644
index 0000000..61bee1c
--- /dev/null
+++ b/test/cli/test_interactive.py
@@ -0,0 +1,35 @@
+from unittest import mock
+
+import typer
+
+from benchllm.cli.evaluator import InteractiveEvaluator
+from benchllm.data_types import FunctionID, Prediction, Test
+
+TEST_PREDICTION = [
+ Prediction(
+ test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
+ output="I am Yoda.",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ )
+]
+
+
+def test_interactive_select_matching_number_passes():
+    evaluator = InteractiveEvaluator()
+    evaluator.load(TEST_PREDICTION)
+    with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "1"):
+        result = evaluator.run()
+        assert result[0].passed
+
+    with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "2"):
+        result = evaluator.run()
+        assert result[0].passed
+
+
+def test_interactive_press_n_fails():
+    evaluator = InteractiveEvaluator()
+    evaluator.load(TEST_PREDICTION)
+    with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "n"):
+        result = evaluator.run()
+        assert not result[0].passed
diff --git a/test/cli/test_list_tests.py b/test/cli/test_list_tests.py
new file mode 100644
index 0000000..e54a541
--- /dev/null
+++ b/test/cli/test_list_tests.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from benchllm.cli.main import app
+
+runner = CliRunner()
+
+
+def test_list_tests():
+ result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")])
+ assert "Input" in result.stdout
+ assert "No." in result.stdout
+ assert "Expected" in result.stdout
diff --git a/test/test_cli_main.py b/test/cli/test_run_suite.py
similarity index 73%
rename from test/test_cli_main.py
rename to test/cli/test_run_suite.py
index 653cf47..1fa2ae5 100644
--- a/test/test_cli_main.py
+++ b/test/cli/test_run_suite.py
@@ -1,4 +1,3 @@
-from pathlib import Path
from test.utils import create_openai_object
from unittest.mock import MagicMock, patch
@@ -19,10 +18,3 @@ def test_run_multiple_suites(completion_mock: MagicMock):
def test_run_target_suite(completion_mock: MagicMock):
runner.invoke(app, ["run", "examples/qa"])
completion_mock.assert_called()
-
-
-def test_list_tests():
- result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")])
- assert "Input" in result.stdout
- assert "No." in result.stdout
- assert "Expected" in result.stdout
diff --git a/test/evaulator/test_evalutator.py b/test/evaulator/test_evalutator.py
new file mode 100644
index 0000000..ab8aa4b
--- /dev/null
+++ b/test/evaulator/test_evalutator.py
@@ -0,0 +1,34 @@
+import json
+import tempfile
+from pathlib import Path
+
+from benchllm import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class NoopEvaluator(Evaluator):
+ def evaluate_prediction(self, prediction: Prediction) -> Evaluator.Match:
+ return Evaluator.Match(prediction=prediction.output, expected=prediction.output)
+
+
+def test_evaluator_can_load_prediction_file():
+ prediction = {
+ "output": "42",
+ "test": {"input": "1+1", "expected": ["2"]},
+ "time_elapsed": 0,
+ "function_id": {"module_path": "test", "line_number": 1},
+ }
+ with tempfile.TemporaryDirectory() as tmpdir:
+ prediction_path = Path(tmpdir, "prediction.json")
+ prediction_path.write_bytes(json.dumps(prediction).encode())
+
+ evaluator = NoopEvaluator()
+ evaluator.load_prediction_file(prediction_path)
+
+ assert evaluator.predictions[0].output == "42"
+ assert evaluator.predictions[0].test.input == "1+1"
+ assert evaluator.predictions[0].test.expected == ["2"]
diff --git a/test/evaulator/test_semantic.py b/test/evaulator/test_semantic.py
new file mode 100644
index 0000000..92dd9b2
--- /dev/null
+++ b/test/evaulator/test_semantic.py
@@ -0,0 +1,60 @@
+from test.utils import create_openai_object
+from unittest.mock import MagicMock, patch
+
+from benchllm import Prediction, SemanticEvaluator, Test
+from benchllm.data_types import FunctionID
+
+
+@patch("openai.Completion.create", return_value=create_openai_object("same"))
+def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock):
+ evaluator = SemanticEvaluator(model="gpt-3")
+ evaluator.load(
+ [
+ Prediction(
+ test=Test(input="Who are you?", expected=["Yoda I am."]),
+ output="I am Yoda.",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ )
+ ]
+ )
+ evaluations = evaluator.run()
+ completion_mock.assert_called_once()
+ assert evaluations[0].passed
+
+
+@patch("openai.Completion.create", return_value=create_openai_object("different"))
+def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock):
+ evaluator = SemanticEvaluator(model="gpt-3")
+ evaluator.load(
+ [
+ Prediction(
+ test=Test(input="What are you?", expected=["Everything"]),
+ output="Nothing",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ ),
+ ]
+ )
+ evaluations = evaluator.run()
+ completion_mock.assert_called_once()
+ assert not evaluations[0].passed
+
+
+@patch("openai.Completion.create", return_value=create_openai_object("same"))
+def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock):
+ evaluator = SemanticEvaluator(model="gpt-3", workers=10)
+ evaluator.load(
+ [
+ Prediction(
+ test=Test(input="Who are you?", expected=["Yoda I am."]),
+ output="I am Yoda.",
+ time_elapsed=0,
+ function_id=FunctionID.default(),
+ )
+ for _ in range(100)
+ ]
+ )
+ evaluations = evaluator.run()
+ assert completion_mock.call_count == 100
+ assert all([evaluation.passed for evaluation in evaluations])
diff --git a/test/test_evalutator.py b/test/evaulator/test_string_match.py
similarity index 50%
rename from test/test_evalutator.py
rename to test/evaulator/test_string_match.py
index 608172a..75ecc89 100644
--- a/test/test_evalutator.py
+++ b/test/evaulator/test_string_match.py
@@ -1,9 +1,4 @@
-import tempfile
-from pathlib import Path
-from test.utils import create_openai_object
-from unittest.mock import MagicMock, Mock, call, patch
-
-from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test
+from benchllm import Prediction, StringMatchEvaluator, Test
from benchllm.data_types import FunctionID
@@ -66,58 +61,3 @@ def test_string_match_passes_if_output_is_equal_to_expected_fuzzy():
evaluations = evaluator.run()
assert evaluations[0].passed
assert not evaluations[1].passed
-
-
-@patch("openai.Completion.create", return_value=create_openai_object("same"))
-def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock):
- evaluator = SemanticEvaluator(model="gpt-3")
- evaluator.load(
- [
- Prediction(
- test=Test(input="Who are you?", expected=["Yoda I am."]),
- output="I am Yoda.",
- time_elapsed=0,
- function_id=FunctionID.default(),
- )
- ]
- )
- evaluations = evaluator.run()
- completion_mock.assert_called_once()
- assert evaluations[0].passed
-
-
-@patch("openai.Completion.create", return_value=create_openai_object("different"))
-def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock):
- evaluator = SemanticEvaluator(model="gpt-3")
- evaluator.load(
- [
- Prediction(
- test=Test(input="What are you?", expected=["Everything"]),
- output="Nothing",
- time_elapsed=0,
- function_id=FunctionID.default(),
- ),
- ]
- )
- evaluations = evaluator.run()
- completion_mock.assert_called_once()
- assert not evaluations[0].passed
-
-
-@patch("openai.Completion.create", return_value=create_openai_object("same"))
-def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock):
- evaluator = SemanticEvaluator(model="gpt-3", workers=10)
- evaluator.load(
- [
- Prediction(
- test=Test(input="Who are you?", expected=["Yoda I am."]),
- output="I am Yoda.",
- time_elapsed=0,
- function_id=FunctionID.default(),
- )
- for _ in range(100)
- ]
- )
- evaluations = evaluator.run()
- assert completion_mock.call_count == 100
- assert all([evaluation.passed for evaluation in evaluations])