added two new caches: FileCache, MemoryCache #2

Merged · 7 commits · Jul 10, 2023

Changes from 2 commits
29 changes: 26 additions & 3 deletions README.md
@@ -1,6 +1,6 @@
# 🏋️‍♂️ BenchLLM 🏋️‍♀️

🦾 Continuous Integration for LLM powered applications 🦙🦅🤖
🦾 Continuous Integration for LLM powered applications 🦙🦅🤖

[![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers)
[![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs)
@@ -10,7 +10,6 @@

BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community


## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)

<hr/>
@@ -26,7 +25,7 @@ Use BenchLLM to:

> ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes.
>
>For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
> For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.

## 🧪 BenchLLM Testing Methodology

@@ -116,6 +115,16 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
$ bench run --evaluator string-match --workers 5
```

To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated before and a cache was in use, the evaluation output is saved and reused in future evaluations. There are several types of caches:

- `memory`, which only caches output values during the current run. This is particularly useful when running with `--retry-count N` (see the example below).
- `file`, which stores the cache at the end of the run as a JSON file in `output/cache.json`. This is the default behavior.
- `none`, which does not use any cache.

```bash
$ bench run examples --cache memory
```
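
The in-memory cache pairs naturally with retries, since repeated (prediction, expected) pairs within a run are only evaluated once. A minimal sketch, assuming your tests live in an `examples` directory:

```bash
# Rerun each test 3 times; cached evaluations are reused within the run.
$ bench run examples --cache memory --retry-count 3
```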

### 🧮 Eval

While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function.
@@ -163,6 +172,20 @@ results = evaluator.run()
print(results)
```

If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows:

```python
from benchllm.cache import FileCache

...

evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json"))
evaluator.load(predictions)
results = evaluator.run()
```

In this example, `FileCache` enables caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow parallel evaluations. The cached results are saved to the file specified by `Path("path/to/cache.json")`.
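
If you only need caching within a single run and don't want a cache file on disk, a minimal sketch (assuming `predictions` is loaded as above) wraps the evaluator in `MemoryCache` instead:

```python
from benchllm.cache import MemoryCache
from benchllm.evaluator import StringMatchEvaluator

# Cached (prediction, expected) results live only in memory for the lifetime of this evaluator.
evaluator = MemoryCache(StringMatchEvaluator(workers=2))
evaluator.load(predictions)
results = evaluator.run()
print(results)
```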

## ☕️ Commands

- `bench add`: Add a new test to a suite.
87 changes: 87 additions & 0 deletions benchllm/cache.py
@@ -0,0 +1,87 @@
import json
from pathlib import Path
from typing import Optional

from benchllm.data_types import Evaluation, Prediction
from benchllm.evaluator import Evaluator
from benchllm.input_types import Json
from benchllm.listener import EvaluatorListener


class MemoryCache(Evaluator):
    """Caches the results of the evaluator in memory"""

    def __init__(self, evaluator: Evaluator):
        super().__init__(workers=evaluator.workers)
        self._data: dict = {}
        self._evaluator = evaluator
        self._num_cache_misses = 0
        self._num_cache_hits = 0

    def _key(self, answer1: Json, answer2: Json) -> str:
        key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1])
        return key1 if key1 < key2 else key2

    def lookup(self, answer1: Json, answer2: Json) -> Optional[bool]:
        return self._data.get(self._key(answer1, answer2), None)

    def store(self, answer1: Json, answer2: Json, value: bool) -> None:
        key = self._key(answer1, answer2)
        self._data[key] = value

    def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
        for expected in prediction.test.expected:
            lookup = self.lookup(expected, prediction.output)
            # None indicates that nothing was found in the cache
            # while True and False are both valid cache values
            if lookup is None:
                continue
            self._num_cache_hits += 1
            if lookup:
                return Evaluator.Match(prediction=prediction.output, expected=expected)
            return None

**Member:**

I am not sure I get the logic here. If we miss, we want to break out of this loop, not return, right? Otherwise, we don't count the miss (line 44).

**Contributor (author):**

```python
for expected in prediction.test.expected:
    lookup = self.lookup(expected, prediction.output)
    # None indicates that nothing was found in the cache
    # while True and False are both valid cache values
    if lookup is None:
        continue
    self._num_cache_hits += 1
    if lookup:
        return Evaluator.Match(prediction=prediction.output, expected=expected)
    return None
```

If `lookup` is None, then we don't have any entry for that (prediction, expected) tuple, so we continue.
If we have an entry, it might be a positive entry (they match), and then we can stop searching.
If we have an entry, it might also be a negative entry (they do not match), and we can also stop searching [this could be debated tbh].

**Contributor (author):**

Have a look now, your comment helped me find a corner case which I also wrote a test case for!

**Member:**

I see, thank you for adding some documentation, it's much clearer now!

        self._num_cache_misses += 1
        result = self._evaluator.evaluate_prediction(prediction)
        if result:
            self.store(result.expected, result.prediction, True)
        else:
            for expected in prediction.test.expected:
                self.store(expected, prediction.output, False)
        return result

    @property
    def num_cache_hits(self) -> int:
        return self._num_cache_hits

    @property
    def num_cache_misses(self) -> int:
        return self._num_cache_misses


class FileCache(MemoryCache, EvaluatorListener):
    """Caches the results of the evaluator in a json file"""

    def __init__(self, evaluator: Evaluator, path: Path):
        super().__init__(evaluator)
        self._path = path
        self.add_listener(self)
        self._load()

    def _load(self) -> None:
        if self._path.exists():
            try:
                cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str)
                if cache["version"] != "1":
                    raise ValueError("Unsupported cache version")
                self._data = cache["entries"]
            except Exception:
                print(f"Failed to load cache file {self._path}")
                self._data = {}

    def _save(self) -> None:
        cache = {"entries": self._data, "version": "1"}
        self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8")

    def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
        self._save()
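
A usage note on the key scheme above: `_key` serializes both orderings of the pair and keeps the lexicographically smaller one, so lookups are order-insensitive. A minimal sketch of the resulting behavior (assuming `StringMatchEvaluator` accepts a `workers` argument, as used elsewhere in this PR):

```python
from benchllm.cache import MemoryCache
from benchllm.evaluator import StringMatchEvaluator

cache = MemoryCache(StringMatchEvaluator(workers=1))
cache.store("foo", "bar", True)

# The key is symmetric, so argument order does not matter on lookup.
assert cache.lookup("bar", "foo") is True
# A pair that was never stored returns None, which evaluate_prediction treats as a cache miss.
assert cache.lookup("foo", "baz") is None
```
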
9 changes: 7 additions & 2 deletions benchllm/cli/commands/evaluate.py
@@ -1,13 +1,14 @@
from pathlib import Path

from benchllm.cache import FileCache
from benchllm.cli.listener import ReportListener, RichCliListener
from benchllm.cli.utils import get_evaluator
from benchllm.cli.utils import add_cache, get_evaluator
from benchllm.evaluator import load_prediction_files
from benchllm.utils import find_json_yml_files


def evaluate_predictions(
file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str
file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str
) -> bool:
files = find_json_yml_files(file_or_dir)

@@ -17,6 +18,10 @@ def evaluate_predictions(
load_prediction_files(file_or_dir)

evaluator = get_evaluator(evaluator_name, model, workers)
evaluator = add_cache(cache, evaluator, output_dir / ".." / f"cache.json")

cli_listener.set_evaulator(evaluator)

evaluator.add_listener(cli_listener)
evaluator.add_listener(report_listener)
for file in files:
8 changes: 7 additions & 1 deletion benchllm/cli/commands/run_suite.py
@@ -2,8 +2,9 @@

import typer

from benchllm.cache import FileCache
from benchllm.cli.listener import ReportListener, RichCliListener
from benchllm.cli.utils import get_evaluator
from benchllm.cli.utils import add_cache, get_evaluator
from benchllm.tester import Tester
from benchllm.utils import find_files

@@ -17,6 +18,7 @@ def run_suite(
workers: int,
evaluator_name: str,
retry_count: int,
cache: str,
) -> bool:
files = find_files(file_search_paths)
if not files:
@@ -45,6 +47,10 @@
return True

evaluator = get_evaluator(evaluator_name, model, workers)
evaluator = add_cache(cache, evaluator, output_dir / ".." / f"cache.json")

cli_listener.set_evaulator(evaluator)

evaluator.add_listener(cli_listener)
evaluator.add_listener(report_listener)
evaluator.load(tester.predictions)
4 changes: 2 additions & 2 deletions benchllm/cli/evaluator.py
@@ -47,11 +47,11 @@ def evaluate_prediction(self, prediction: Prediction) -> bool:


class WebEvaluator(Evaluator):
def __init__(self):
def __init__(self) -> None:
super().__init__(workers=1)

@session.defer_call
def on_close():
def on_close() -> None:
typer.secho(
f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True
)
19 changes: 18 additions & 1 deletion benchllm/cli/listener.py
@@ -1,14 +1,17 @@
import datetime
import json
from pathlib import Path
from typing import Optional

import typer
from rich import print
from rich.console import Console
from rich.markup import render
from rich.table import Table

from benchllm.cache import MemoryCache
from benchllm.data_types import Evaluation, FunctionID, Prediction, Test, TestFunction
from benchllm.evaluator import Evaluator
from benchllm.listener import EvaluatorListener, TesterListener


@@ -37,12 +40,23 @@ def evaluate_prediction_ended(self, evaluation: Evaluation) -> None:


class RichCliListener(TesterListener, EvaluatorListener):
def __init__(self, root_dir: Path, *, interactive: bool, test_only: bool = False, eval_only: bool = False) -> None:
def __init__(
self,
root_dir: Path,
*,
interactive: bool,
test_only: bool = False,
eval_only: bool = False,
) -> None:
super().__init__()
self.root_dir = root_dir
self.interactive = interactive
self._eval_only = eval_only
self._test_only = test_only
self._evaluator: Optional[Evaluator] = None

def set_evaulator(self, evaluator: Evaluator) -> None:
self._evaluator = evaluator

def test_run_started(self) -> None:
print_centered(" Run Tests ")
@@ -116,6 +130,9 @@ def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
console.print(table)

tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] "
if isinstance(self._evaluator, MemoryCache):
tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) "

print_centered(tmp)


4 changes: 4 additions & 0 deletions benchllm/cli/main.py
@@ -28,6 +28,7 @@ def run(
workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1,
evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
) -> None:
if not file_or_dir:
file_or_dir = [Path.cwd()]
@@ -40,6 +41,7 @@
evaluator_name=evaluator,
no_eval=not eval,
retry_count=retry_count,
cache=cache,
)
if not success:
raise typer.Exit(code=1)
@@ -61,13 +63,15 @@ def eval(
model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3",
workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
) -> None:
success = evaluate_predictions(
file_or_dir=file_or_dir,
model=model,
output_dir=output_dir,
workers=workers,
evaluator_name=evaluator,
cache=cache,
)
if not success:
raise typer.Exit(code=1)
12 changes: 12 additions & 0 deletions benchllm/cli/utils.py
@@ -1,6 +1,7 @@
import datetime
from pathlib import Path

from benchllm.cache import FileCache, MemoryCache
from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator
from benchllm.evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator

@@ -28,3 +29,14 @@ def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
        return WebEvaluator()
    else:
        raise ValueError(f"Unknown evaluator {evaluator_name}")


def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator:
    if cache_name == "file":
        return FileCache(evaluator, cache_path)
    elif cache_name == "memory":
        return MemoryCache(evaluator)
    elif cache_name == "none":
        return evaluator
    else:
        raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'")