diff --git a/README.md b/README.md index 4206fe0..972395d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 🏋️‍♂️ BenchLLM 🏋️‍♀️ -🦾 Continuous Integration for LLM powered applications 🦙🦅🤖 +🦾 Continuous Integration for LLM powered applications 🦙🦅🤖 [![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers) [![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs) @@ -10,7 +10,6 @@ BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community - ## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)
@@ -26,7 +25,7 @@ Use BenchLLM to: > ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes. > ->For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page. +> For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page. ## 🧪 BenchLLM Testing Methodology @@ -116,6 +115,16 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat ``` $ bench run --evaluator string-match --workers 5 ``` +To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has already been evaluated and a cache is enabled, the stored result is reused instead of re-running the evaluator. There are several types of caches: + +- `memory`, caches results in memory only for the duration of the current run. This is particularly useful when running with `--retry-count N`. +- `file`, stores the cache as a JSON file in `output/cache.json` at the end of the run. This is the default behavior. +- `none`, does not use any cache. + +```bash +$ bench run examples --cache memory +``` + ### 🧮 Eval While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function. @@ -163,6 +172,20 @@ results = evaluator.run() print(results) ``` +If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows: + +```python +from benchllm.cache import FileCache + +... + +evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json")) +evaluator.load(predictions) +results = evaluator.run() +``` + +In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow parallel evaluations. The cached results are saved in the file specified by `Path("path/to/cache.json")`. + ## ☕️ Commands - `bench add`: Add a new test to a suite. 
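For readers who want the in-memory variant programmatically rather than via the CLI flag, here is a minimal sketch (not part of this diff) that mirrors the `FileCache` example above, assuming the same `predictions` list; it relies only on the `MemoryCache` class and the `num_cache_hits`/`num_cache_misses` properties introduced in `benchllm/cache.py`:

```python
from benchllm import StringMatchEvaluator
from benchllm.cache import MemoryCache

# Wrap any evaluator; results are cached in memory only for the duration of this run.
evaluator = MemoryCache(StringMatchEvaluator(workers=2))
evaluator.load(predictions)  # `predictions` as in the README example above (assumed to exist)
results = evaluator.run()

# MemoryCache exposes simple counters that the CLI listener also reports after a run.
print(evaluator.num_cache_hits, evaluator.num_cache_misses)
```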
diff --git a/benchllm/__init__.py b/benchllm/__init__.py index 43cb476..8c0bf5f 100644 --- a/benchllm/__init__.py +++ b/benchllm/__init__.py @@ -1,10 +1,10 @@ import inspect from pathlib import Path -from typing import Any, Callable, Generic, Optional, Type, TypeVar +from typing import Callable, Type, TypeVar from .data_types import Evaluation, Prediction, Test # noqa from .evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator # noqa -from .input_types import ChatInput, SimilarityInput +from .input_types import ChatInput, SimilarityInput # noqa from .similarity import semantically_similar # noqa from .singleton import TestSingleton # noqa from .tester import Tester # noqa diff --git a/benchllm/cache.py b/benchllm/cache.py new file mode 100644 index 0000000..9981d47 --- /dev/null +++ b/benchllm/cache.py @@ -0,0 +1,96 @@ +import json +from pathlib import Path +from typing import Optional + +from benchllm.data_types import Evaluation, Prediction +from benchllm.evaluator import Evaluator +from benchllm.input_types import Json +from benchllm.listener import EvaluatorListener + + +class MemoryCache(Evaluator): + """Caches the results of the evaluator in memory""" + + def __init__(self, evaluator: Evaluator): + super().__init__(workers=evaluator.workers) + self._data: dict = {} + self._evaluator = evaluator + self._num_cache_misses = 0 + self._num_cache_hits = 0 + + def _key(self, answer1: Json, answer2: Json) -> str: + key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1]) + return key1 if key1 < key2 else key2 + + def lookup(self, answer1: Json, answer2: Json) -> Optional[bool]: + return self._data.get(self._key(answer1, answer2), None) + + def store(self, answer1: Json, answer2: Json, value: bool) -> None: + key = self._key(answer1, answer2) + self._data[key] = value + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + uncached_expectations = [] + for expected in prediction.test.expected: + lookup = self.lookup(expected, prediction.output) + if lookup is None: + uncached_expectations.append(expected) + elif lookup: + # If we find a positive match we can stop comparing and just return. + # For negative matches we still need to check the other expected answers. + self._num_cache_hits += 1 + return Evaluator.Match(prediction=prediction.output, expected=expected) + + # If all expectations were found in the cache but were negative matches, + # we increment the cache hits counter and return None as there's no match. 
+ if not uncached_expectations: + self._num_cache_hits += 1 + return None + + self._num_cache_misses += 1 + # set prediction.test.expected to only the ones that were not cached + prediction = Prediction(**prediction.dict()) + prediction.test.expected = uncached_expectations + result = self._evaluator.evaluate_prediction(prediction) + if result: + self.store(result.expected, result.prediction, True) + else: + for expected in prediction.test.expected: + self.store(expected, prediction.output, False) + return result + + @property + def num_cache_hits(self) -> int: + return self._num_cache_hits + + @property + def num_cache_misses(self) -> int: + return self._num_cache_misses + + +class FileCache(MemoryCache, EvaluatorListener): + """Caches the results of the evaluator in a json file""" + + def __init__(self, evaluator: Evaluator, path: Path): + super().__init__(evaluator) + self._path = path + self.add_listener(self) + self._load() + + def _load(self) -> None: + if self._path.exists(): + try: + cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str) + if cache["version"] != "1": + raise ValueError("Unsupported cache version") + self._data = cache["entries"] + except Exception: + print(f"Failed to load cache file {self._path}") + self._data = {} + + def _save(self) -> None: + cache = {"entries": self._data, "version": "1"} + self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8") + + def evaluate_ended(self, evaluations: list[Evaluation]) -> None: + self._save() diff --git a/benchllm/cli/commands/evaluate.py b/benchllm/cli/commands/evaluate.py index e3e14bc..0451d5d 100644 --- a/benchllm/cli/commands/evaluate.py +++ b/benchllm/cli/commands/evaluate.py @@ -1,13 +1,13 @@ from pathlib import Path +from benchllm.cache import FileCache from benchllm.cli.listener import ReportListener, RichCliListener -from benchllm.cli.utils import get_evaluator -from benchllm.evaluator import load_prediction_files -from benchllm.utils import find_json_yml_files +from benchllm.cli.utils import add_cache, get_evaluator +from benchllm.utils import find_json_yml_files, load_prediction_files def evaluate_predictions( - file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str + file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str ) -> bool: files = find_json_yml_files(file_or_dir) @@ -17,6 +17,10 @@ def evaluate_predictions( load_prediction_files(file_or_dir) evaluator = get_evaluator(evaluator_name, model, workers) + evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json") + + cli_listener.set_evaulator(evaluator) + evaluator.add_listener(cli_listener) evaluator.add_listener(report_listener) for file in files: diff --git a/benchllm/cli/commands/run_suite.py b/benchllm/cli/commands/run_suite.py index 53e39ca..075d8c0 100644 --- a/benchllm/cli/commands/run_suite.py +++ b/benchllm/cli/commands/run_suite.py @@ -2,8 +2,9 @@ import typer +from benchllm.cache import FileCache from benchllm.cli.listener import ReportListener, RichCliListener -from benchllm.cli.utils import get_evaluator +from benchllm.cli.utils import add_cache, get_evaluator from benchllm.tester import Tester from benchllm.utils import find_files @@ -17,6 +18,7 @@ def run_suite( workers: int, evaluator_name: str, retry_count: int, + cache: str, ) -> bool: files = find_files(file_search_paths) if not files: @@ -45,6 +47,10 @@ def run_suite( return True evaluator = get_evaluator(evaluator_name, model, workers) + evaluator = 
add_cache(cache, evaluator, output_dir.parent / "cache.json") + + cli_listener.set_evaulator(evaluator) + evaluator.add_listener(cli_listener) evaluator.add_listener(report_listener) evaluator.load(tester.predictions) diff --git a/benchllm/cli/evaluator.py b/benchllm/cli/evaluator.py deleted file mode 100644 index d36bc85..0000000 --- a/benchllm/cli/evaluator.py +++ /dev/null @@ -1,78 +0,0 @@ -import signal - -import typer -from pywebio import session -from pywebio.input import actions -from pywebio.output import put_markdown, put_table - -from benchllm.data_types import Prediction -from benchllm.evaluator import Evaluator - - -class InteractiveEvaluator(Evaluator): - def evaluate_prediction(self, prediction: Prediction) -> bool: - header = ( - f'{typer.style("Does ", bold=True)}' - f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}" - f'{typer.style(" match any of the following expected prompts?", bold=True)}' - ) - typer.echo("") - typer.echo(header) - - for i, expected in enumerate(prediction.test.expected, start=1): - typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False) - typer.secho(expected, bold=True) - - while True: - prompt_string = ( - f'{typer.style("[")}' - f'{typer.style("y", fg=typer.colors.GREEN, bold=True)}' - f'{typer.style("/")}' - f'{typer.style("n", fg=typer.colors.RED, bold=True)}' - f'{typer.style("]")}' - ) - - response = response = typer.prompt(prompt_string).lower() - if response == "y": - return True - elif response == "n": - return False - else: - typer.secho( - 'Invalid answer. Please just use "y" to mark the test as correct, and "n" to mark the test as incorrect', - fg=typer.colors.RED, - bold=True, - ) - continue - - -class WebEvaluator(Evaluator): - def __init__(self): - super().__init__(workers=1) - - @session.defer_call - def on_close(): - typer.secho( - f"The evaluation was interrupted. 
Run bench eval to start again", fg=typer.colors.RED, bold=True - ) - # sys.exit doesn't work here, so we have to raise a signal to kill the process - signal.raise_signal(signal.SIGINT) - - put_markdown("# BenchLLM Web Evaluator") - - def evaluate_prediction(self, prediction: Prediction) -> bool: - test_name = prediction.test.file_path or prediction.test.id - - put_markdown(f"## {test_name}") - table = [["Question:", f"{prediction.test.input}"], ["Prediction:", prediction.output]] - for i, expected in enumerate(prediction.test.expected): - table.append([f"Expected ({i+1}):", expected]) - - put_table(table) - - result = actions( - label="Does the prediction match any of the answers?", - buttons=[{"label": "Yes", "value": True}, {"label": "No", "value": False}], - ) - - return bool(result) diff --git a/benchllm/cli/evaluator/__init__.py b/benchllm/cli/evaluator/__init__.py new file mode 100644 index 0000000..fb9ee8a --- /dev/null +++ b/benchllm/cli/evaluator/__init__.py @@ -0,0 +1,2 @@ +from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa +from benchllm.cli.evaluator.web import WebEvaluator # noqa diff --git a/benchllm/cli/evaluator/interactive.py b/benchllm/cli/evaluator/interactive.py new file mode 100644 index 0000000..25aa068 --- /dev/null +++ b/benchllm/cli/evaluator/interactive.py @@ -0,0 +1,31 @@ +from typing import Optional + +import click +import typer + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + + +class InteractiveEvaluator(Evaluator): + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + header = ( + f'{typer.style("Does ", bold=True)}' + f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}" + f'{typer.style(" match any of the following expected prompts?", bold=True)}' + ) + typer.echo("") + typer.echo(header) + + for i, expected in enumerate(prediction.test.expected, start=1): + typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False) + typer.secho(expected, bold=True) + + options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"] + + prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]" + click_choice = click.Choice(options) + response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower() + if response == "n": + return None + return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[int(response) - 1]) diff --git a/benchllm/cli/evaluator/web.py b/benchllm/cli/evaluator/web.py new file mode 100644 index 0000000..7ca7caf --- /dev/null +++ b/benchllm/cli/evaluator/web.py @@ -0,0 +1,47 @@ +import signal +from typing import Optional + +import typer +from pywebio import session +from pywebio.input import radio +from pywebio.output import put_markdown + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + + +class WebEvaluator(Evaluator): + def __init__(self) -> None: + super().__init__(workers=1) + + @session.defer_call + def on_close() -> None: + print("shutting down") + typer.secho( + f"The evaluation was interrupted. 
Run bench eval to start again", fg=typer.colors.RED, bold=True + ) + # sys.exit doesn't work here, so we have to raise a signal to kill the process + signal.raise_signal(signal.SIGINT) + + put_markdown("# BenchLLM Web Evaluator") + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + test_name = prediction.test.file_path or prediction.test.id + + put_markdown(f"## {test_name}") + put_markdown(f"*Question*: `{prediction.test.input}`") + put_markdown(f"*Prediction*: `{prediction.output}`") + + table = [["Question:", f"{prediction.test.input}", ""], ["Prediction:", prediction.output], ""] + label = f"Question: {prediction.test.input}Prediction: {prediction.output}" + + options: list[dict[str, Optional[int | str]]] = [ + {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected) + ] + options.append({"label": "None", "value": None, "selected": True}) + answer = radio("Pick the matching answer", options=options, required=True) + + if answer and isinstance(answer, int): + return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[answer]) + else: + return None diff --git a/benchllm/cli/listener.py b/benchllm/cli/listener.py index e1f517e..f1f6135 100644 --- a/benchllm/cli/listener.py +++ b/benchllm/cli/listener.py @@ -1,6 +1,7 @@ import datetime import json from pathlib import Path +from typing import Optional import typer from rich import print @@ -8,7 +9,9 @@ from rich.markup import render from rich.table import Table +from benchllm.cache import MemoryCache from benchllm.data_types import Evaluation, FunctionID, Prediction, Test, TestFunction +from benchllm.evaluator import Evaluator from benchllm.listener import EvaluatorListener, TesterListener @@ -37,12 +40,23 @@ def evaluate_prediction_ended(self, evaluation: Evaluation) -> None: class RichCliListener(TesterListener, EvaluatorListener): - def __init__(self, root_dir: Path, *, interactive: bool, test_only: bool = False, eval_only: bool = False) -> None: + def __init__( + self, + root_dir: Path, + *, + interactive: bool, + test_only: bool = False, + eval_only: bool = False, + ) -> None: super().__init__() self.root_dir = root_dir self.interactive = interactive self._eval_only = eval_only self._test_only = test_only + self._evaluator: Optional[Evaluator] = None + + def set_evaulator(self, evaluator: Evaluator) -> None: + self._evaluator = evaluator def test_run_started(self) -> None: print_centered(" Run Tests ") @@ -116,6 +130,9 @@ def evaluate_ended(self, evaluations: list[Evaluation]) -> None: console.print(table) tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] " + if isinstance(self._evaluator, MemoryCache): + tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) " + print_centered(tmp) diff --git a/benchllm/cli/main.py b/benchllm/cli/main.py index 353fca7..e2fe202 100644 --- a/benchllm/cli/main.py +++ b/benchllm/cli/main.py @@ -28,6 +28,7 @@ def run( workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1, evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", + cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", ) -> None: if not file_or_dir: file_or_dir = [Path.cwd()] @@ -40,6 
+41,7 @@ def run( evaluator_name=evaluator, no_eval=not eval, retry_count=retry_count, + cache=cache, ) if not success: raise typer.Exit(code=1) @@ -61,6 +63,7 @@ def eval( model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3", workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", + cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", ) -> None: success = evaluate_predictions( file_or_dir=file_or_dir, @@ -68,6 +71,7 @@ def eval( output_dir=output_dir, workers=workers, evaluator_name=evaluator, + cache=cache, ) if not success: raise typer.Exit(code=1) diff --git a/benchllm/cli/utils.py b/benchllm/cli/utils.py index 9364e76..501048e 100644 --- a/benchllm/cli/utils.py +++ b/benchllm/cli/utils.py @@ -1,6 +1,7 @@ import datetime from pathlib import Path +from benchllm.cache import FileCache, MemoryCache from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator from benchllm.evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator @@ -28,3 +29,14 @@ def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator: return WebEvaluator() else: raise ValueError(f"Unknown evaluator {evaluator_name}") + + +def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator: + if cache_name == "file": + return FileCache(evaluator, cache_path) + elif cache_name == "memory": + return MemoryCache(evaluator) + elif cache_name == "none": + return evaluator + else: + raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'") diff --git a/benchllm/evaluator/__init__.py b/benchllm/evaluator/__init__.py new file mode 100644 index 0000000..d92e96e --- /dev/null +++ b/benchllm/evaluator/__init__.py @@ -0,0 +1,3 @@ +from benchllm.evaluator.evaluator import Evaluator # noqa +from benchllm.evaluator.semantic import SemanticEvaluator # noqa +from benchllm.evaluator.string_match import StringMatchEvaluator # noqa diff --git a/benchllm/evaluator.py b/benchllm/evaluator/evaluator.py similarity index 63% rename from benchllm/evaluator.py rename to benchllm/evaluator/evaluator.py index 27d8f68..87affb6 100644 --- a/benchllm/evaluator.py +++ b/benchllm/evaluator/evaluator.py @@ -5,13 +5,14 @@ from operator import attrgetter from pathlib import Path from timeit import default_timer as timer -from typing import List +from typing import Optional import yaml +from pydantic import BaseModel from benchllm.data_types import Evaluation, FunctionID, Prediction +from benchllm.input_types import Json from benchllm.listener import EvaluatorListener -from benchllm.similarity import semantically_similar class Evaluator(ABC): @@ -21,6 +22,10 @@ def __init__(self, workers: int = 1): self._evaluations: list[Evaluation] = [] self._workers: int = workers + class Match(BaseModel): + prediction: Json + expected: Json + def add_listener(self, listener: EvaluatorListener) -> None: self._listeners.append(listener) @@ -55,7 +60,9 @@ def _run_evaluation(self, prediction: Prediction) -> Evaluation: start = timer() match = self.evaluate_prediction(prediction) end = timer() - evaluation = Evaluation(prediction=prediction, passed=match, eval_time_elapsed=end - start) + evaluation = Evaluation( + prediction=prediction, passed=isinstance(match, Evaluator.Match), eval_time_elapsed=end - start + ) self._broadcast_evaluate_prediction_ended(evaluation) return evaluation @@ 
-71,8 +78,17 @@ def failed(self) -> list[Evaluation]: def evaluations(self) -> list[Evaluation]: return self._evaluations + @property + def workers(self) -> int: + return self._workers + + @property + def predictions(self) -> list[Prediction]: + return self._predictions + @abstractmethod - def evaluate_prediction(self, prediction: Prediction) -> bool: + def evaluate_prediction(self, prediction: Prediction) -> Optional[Match]: + """Evaluate a single prediction, return a Match if the prediction matches the expected output.""" pass def max_threads(self) -> int: @@ -101,59 +117,3 @@ def _broadcast_evaluate_module_ended(self) -> None: def _broadcast_evaluate_ended(self, evaluations: list[Evaluation]) -> None: for listener in self._listeners: listener.evaluate_ended(evaluations) - - -class SemanticEvaluator(Evaluator): - def __init__(self, *, model: str = "gpt-3", workers: int = 1): - super().__init__(workers=workers) - self.model = model - - def evaluate_prediction(self, prediction: Prediction) -> bool: - for expected in prediction.test.expected: - if semantically_similar(expected, prediction.output, model=self.model): - return True - return False - - -class StringMatchEvaluator(Evaluator): - def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1): - super().__init__(workers=workers) - - self._case_sensitive = case_sensitive - self._fuzzy = fuzzy - - def match_strings(self, expected: str, output: str) -> bool: - if not self._case_sensitive: - expected = expected.lower() - output = output.lower() - - if self._fuzzy: - return expected in output or output in expected - - return expected == output - - def evaluate_prediction(self, prediction: Prediction) -> bool: - output = prediction.output - return any([self.match_strings(expected, output) for expected in prediction.test.expected]) - - -def load_prediction_files(paths: List[Path]) -> List[Prediction]: - import json - - import yaml - - predictions = [] - for path in paths: - for file_path in path.rglob("*"): - if not file_path.is_file(): - continue - if file_path.suffix not in {".json", ".yml", ".yaml"}: - continue - with open(file_path, "r") as file: - if file_path.suffix == ".json": - data = json.load(file) - predictions.append(Prediction(**data)) - elif file_path.suffix in {".yml", ".yaml"}: - data = yaml.safe_load(file) - predictions.append(Prediction(**data)) - return predictions diff --git a/benchllm/evaluator/semantic.py b/benchllm/evaluator/semantic.py new file mode 100644 index 0000000..73881a4 --- /dev/null +++ b/benchllm/evaluator/semantic.py @@ -0,0 +1,17 @@ +from typing import Optional + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator +from benchllm.similarity import semantically_similar + + +class SemanticEvaluator(Evaluator): + def __init__(self, *, model: str = "gpt-3", workers: int = 1): + super().__init__(workers=workers) + self.model = model + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + for expected in prediction.test.expected: + if semantically_similar(expected, prediction.output, model=self.model): + return Evaluator.Match(prediction=prediction.output, expected=expected) + return None diff --git a/benchllm/evaluator/string_match.py b/benchllm/evaluator/string_match.py new file mode 100644 index 0000000..eb59951 --- /dev/null +++ b/benchllm/evaluator/string_match.py @@ -0,0 +1,30 @@ +import json +from typing import Optional + +from benchllm.data_types import Prediction +from benchllm.evaluator import Evaluator + 
+ +class StringMatchEvaluator(Evaluator): + def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1): + super().__init__(workers=workers) + + self._case_sensitive = case_sensitive + self._fuzzy = fuzzy + + def match_strings(self, expected: str, output: str) -> bool: + if not self._case_sensitive: + expected = expected.lower() + output = output.lower() + + if self._fuzzy: + return expected in output or output in expected + + return expected == output + + def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]: + output = prediction.output + for expected in prediction.test.expected: + if self.match_strings(expected, output): + return Evaluator.Match(prediction=prediction.output, expected=expected) + return None diff --git a/benchllm/input_types.py b/benchllm/input_types.py index b643261..d72e503 100644 --- a/benchllm/input_types.py +++ b/benchllm/input_types.py @@ -1,8 +1,9 @@ -import json -from typing import TypedDict +from typing import TypedDict, Union from pydantic import BaseModel +Json = Union[str, bool, list, dict] + class ChatInputItem(TypedDict): role: str diff --git a/benchllm/similarity.py b/benchllm/similarity.py index a6f5fbd..787d7d6 100644 --- a/benchllm/similarity.py +++ b/benchllm/similarity.py @@ -47,4 +47,6 @@ def semantically_similar(answer1: str, answer2: str, model: str = "gpt-3") -> bo }}""", model=model, ) + if response not in ["same", "different"]: + raise ValueError(f"Unexpected response: {response}") return response == "same" diff --git a/benchllm/singleton.py b/benchllm/singleton.py index 503fc1f..456bbe2 100644 --- a/benchllm/singleton.py +++ b/benchllm/singleton.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Callable, Generic, Optional, Type, TypeVar +from typing import Callable, Generic, Optional, Type, TypeVar T = TypeVar("T") diff --git a/benchllm/utils.py b/benchllm/utils.py index 633ea99..b4f6f74 100644 --- a/benchllm/utils.py +++ b/benchllm/utils.py @@ -1,6 +1,11 @@ import ast +import json from pathlib import Path +import yaml + +from benchllm.data_types import Prediction + class DecoratorFinder(ast.NodeVisitor): def __init__(self) -> None: @@ -59,3 +64,21 @@ def find_json_yml_files(paths: list[Path]) -> list[Path]: if file.suffix in (".yml", ".json", ".yaml"): files.append(file) return list(set(files)) + + +def load_prediction_files(paths: list[Path]) -> list[Prediction]: + predictions = [] + for path in paths: + for file_path in path.rglob("*"): + if not file_path.is_file(): + continue + if file_path.suffix not in {".json", ".yml", ".yaml"}: + continue + with open(file_path, "r") as file: + if file_path.suffix == ".json": + data = json.load(file) + predictions.append(Prediction(**data)) + elif file_path.suffix in {".yml", ".yaml"}: + data = yaml.safe_load(file) + predictions.append(Prediction(**data)) + return predictions diff --git a/test/cache/test_file_cache.py b/test/cache/test_file_cache.py new file mode 100644 index 0000000..d5218c0 --- /dev/null +++ b/test/cache/test_file_cache.py @@ -0,0 +1,74 @@ +import tempfile +from pathlib import Path +from unittest.mock import patch + +from benchllm import Prediction, StringMatchEvaluator, Test +from benchllm.cache import FileCache +from benchllm.data_types import FunctionID + +EXAMPLE_PREDICTIONS = [ + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + 
output="def", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + +EXAMPLE_PREDICTIONS_ALL_SAME = [ + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +def test_file_writes_at_end(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + with tempfile.TemporaryDirectory() as temp_dir: + cache_path = Path(temp_dir, "cache.json") + evaluator = FileCache(StringMatchEvaluator(), cache_path) + evaluator.load(EXAMPLE_PREDICTIONS) + + evaluations = evaluator.run() + assert cache_path.exists() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + mock_method.reset_mock() + + # second run will use cache + evaluator = FileCache(StringMatchEvaluator(), cache_path) + evaluator.load(EXAMPLE_PREDICTIONS) + + evaluations = evaluator.run() + assert cache_path.exists() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 0 + assert evaluator.num_cache_hits == 3 diff --git a/test/cache/test_memory_cache.py b/test/cache/test_memory_cache.py new file mode 100644 index 0000000..0d4288b --- /dev/null +++ b/test/cache/test_memory_cache.py @@ -0,0 +1,163 @@ +from unittest.mock import patch + +from benchllm import Prediction, StringMatchEvaluator, Test +from benchllm.cache import MemoryCache +from benchllm.data_types import FunctionID + +EXAMPLE_PREDICTIONS = [ + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="def", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["abc", "def", "ghi"]), + output="no-match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + +EXAMPLE_PREDICTIONS_ALL_SAME = [ + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +EXAMPLE_PREDICTIONS_CACHING_NEGATIVE = [ + Prediction( + test=Test(input="foo", expected=["no-match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["no-match", "match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["match", "no-match"]), + output="match", + time_elapsed=0, + function_id=FunctionID.default(), + ), +] + + +def test_memory_cache_will_prevent_calls_to_evaluate_prediction_on_second_run(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS) + evaluations = evaluator.run() + 
assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + mock_method.reset_mock() + + # second run will use cache + evaluations = evaluator.run() + assert not evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert evaluator.num_cache_hits == 4 + assert mock_method.call_count == 0 + + +def test_memory_cache_caches_during_run(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_ALL_SAME) + + evaluations = evaluator.run() + assert evaluations[0].passed + assert evaluations[1].passed + assert mock_method.call_count == 1 + assert evaluator.num_cache_hits == 1 + + +def test_memory_cache_caches_always_tries_to_pass(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) + + evaluations = evaluator.run() + assert not evaluations[0].passed + assert evaluations[1].passed + assert evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 + + +def test_memory_cache_does_not_pass_on_cached_negatives(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) + + evaluator.run() + assert mock_method.call_count == 2 + assert mock_method.call_args_list.pop(0).args[0].test.expected == ["no-match"] + assert mock_method.call_args_list.pop(0).args[0].test.expected == ["match"] + + +def test_memory_cache_supports_numbers(): + with patch.object( + StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction + ) as mock_method: + evaluator = MemoryCache(StringMatchEvaluator()) + evaluator.load( + [ + Prediction( + test=Test(input="foo", expected=["42"]), + output="42", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["42"]), + output="42", + time_elapsed=0, + function_id=FunctionID.default(), + ), + Prediction( + test=Test(input="foo", expected=["42"]), + output="24", + time_elapsed=0, + function_id=FunctionID.default(), + ), + ] + ) + evaluations = evaluator.run() + assert evaluations[0].passed + assert evaluations[1].passed + assert not evaluations[2].passed + assert mock_method.call_count == 2 + assert evaluator.num_cache_hits == 1 diff --git a/test/cli/test_interactive.py b/test/cli/test_interactive.py new file mode 100644 index 0000000..61bee1c --- /dev/null +++ b/test/cli/test_interactive.py @@ -0,0 +1,35 @@ +from unittest import mock + +import typer + +from benchllm.cli.evaluator import InteractiveEvaluator +from benchllm.data_types import FunctionID, Prediction, Test + +TEST_PREDICTION = [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) +] + + +def test_interactive_press_y_passes(): + evalautor = InteractiveEvaluator() + evalautor.load(TEST_PREDICTION) + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "1"): + result = 
evalautor.run() + assert result[0].passed + + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "2"): + result = evalautor.run() + assert result[0].passed + + +def test_interactive_press_n_fails(): + evalautor = InteractiveEvaluator() + evalautor.load(TEST_PREDICTION) + with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "n"): + result = evalautor.run() + assert not result[0].passed diff --git a/test/cli/test_list_tests.py b/test/cli/test_list_tests.py new file mode 100644 index 0000000..e54a541 --- /dev/null +++ b/test/cli/test_list_tests.py @@ -0,0 +1,14 @@ +from pathlib import Path + +from typer.testing import CliRunner + +from benchllm.cli.main import app + +runner = CliRunner() + + +def test_list_tests(): + result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")]) + assert "Input" in result.stdout + assert "No." in result.stdout + assert "Expected" in result.stdout diff --git a/test/test_cli_main.py b/test/cli/test_run_suite.py similarity index 73% rename from test/test_cli_main.py rename to test/cli/test_run_suite.py index 653cf47..1fa2ae5 100644 --- a/test/test_cli_main.py +++ b/test/cli/test_run_suite.py @@ -1,4 +1,3 @@ -from pathlib import Path from test.utils import create_openai_object from unittest.mock import MagicMock, patch @@ -19,10 +18,3 @@ def test_run_multiple_suites(completion_mock: MagicMock): def test_run_target_suite(completion_mock: MagicMock): runner.invoke(app, ["run", "examples/qa"]) completion_mock.assert_called() - - -def test_list_tests(): - result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")]) - assert "Input" in result.stdout - assert "No." in result.stdout - assert "Expected" in result.stdout diff --git a/test/evaulator/test_evalutator.py b/test/evaulator/test_evalutator.py new file mode 100644 index 0000000..ab8aa4b --- /dev/null +++ b/test/evaulator/test_evalutator.py @@ -0,0 +1,34 @@ +import json +import tempfile +from pathlib import Path +from test.utils import create_openai_object +from unittest.mock import MagicMock, Mock, call, patch + +from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test +from benchllm.cache import MemoryCache +from benchllm.data_types import FunctionID +from benchllm.evaluator import Evaluator + + +class NoopEvaluator(Evaluator): + def evaluate_prediction(self, prediction: Prediction) -> Evaluator.Match: + return Evaluator.Match(prediction=prediction.output, expected=prediction.output) + + +def test_evaluator_can_load_prediction_file(): + prediction = { + "output": "42", + "test": {"input": "1+1", "expected": ["2"]}, + "time_elapsed": 0, + "function_id": {"module_path": "test", "line_number": 1}, + } + with tempfile.TemporaryDirectory() as tmpdir: + prediction_path = Path(tmpdir, "prediction.json") + prediction_path.write_bytes(json.dumps(prediction).encode()) + + evaluator = NoopEvaluator() + evaluator.load_prediction_file(prediction_path) + + assert evaluator.predictions[0].output == "42" + assert evaluator.predictions[0].test.input == "1+1" + assert evaluator.predictions[0].test.expected == ["2"] diff --git a/test/evaulator/test_semantic.py b/test/evaulator/test_semantic.py new file mode 100644 index 0000000..92dd9b2 --- /dev/null +++ b/test/evaulator/test_semantic.py @@ -0,0 +1,60 @@ +from test.utils import create_openai_object +from unittest.mock import MagicMock, patch + +from benchllm import Prediction, SemanticEvaluator, Test +from benchllm.data_types import FunctionID + + +@patch("openai.Completion.create", 
return_value=create_openai_object("same")) +def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3") + evaluator.load( + [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am."]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) + ] + ) + evaluations = evaluator.run() + completion_mock.assert_called_once() + assert evaluations[0].passed + + +@patch("openai.Completion.create", return_value=create_openai_object("different")) +def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3") + evaluator.load( + [ + Prediction( + test=Test(input="What are you?", expected=["Everything"]), + output="Nothing", + time_elapsed=0, + function_id=FunctionID.default(), + ), + ] + ) + evaluations = evaluator.run() + completion_mock.assert_called_once() + assert not evaluations[0].passed + + +@patch("openai.Completion.create", return_value=create_openai_object("same")) +def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock): + evaluator = SemanticEvaluator(model="gpt-3", workers=10) + evaluator.load( + [ + Prediction( + test=Test(input="Who are you?", expected=["Yoda I am."]), + output="I am Yoda.", + time_elapsed=0, + function_id=FunctionID.default(), + ) + for _ in range(100) + ] + ) + evaluations = evaluator.run() + assert completion_mock.call_count == 100 + assert all([evaluation.passed for evaluation in evaluations]) diff --git a/test/test_evalutator.py b/test/evaulator/test_string_match.py similarity index 50% rename from test/test_evalutator.py rename to test/evaulator/test_string_match.py index 608172a..75ecc89 100644 --- a/test/test_evalutator.py +++ b/test/evaulator/test_string_match.py @@ -1,9 +1,4 @@ -import tempfile -from pathlib import Path -from test.utils import create_openai_object -from unittest.mock import MagicMock, Mock, call, patch - -from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test +from benchllm import Prediction, StringMatchEvaluator, Test from benchllm.data_types import FunctionID @@ -66,58 +61,3 @@ def test_string_match_passes_if_output_is_equal_to_expected_fuzzy(): evaluations = evaluator.run() assert evaluations[0].passed assert not evaluations[1].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("same")) -def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3") - evaluator.load( - [ - Prediction( - test=Test(input="Who are you?", expected=["Yoda I am."]), - output="I am Yoda.", - time_elapsed=0, - function_id=FunctionID.default(), - ) - ] - ) - evaluations = evaluator.run() - completion_mock.assert_called_once() - assert evaluations[0].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("different")) -def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3") - evaluator.load( - [ - Prediction( - test=Test(input="What are you?", expected=["Everything"]), - output="Nothing", - time_elapsed=0, - function_id=FunctionID.default(), - ), - ] - ) - evaluations = evaluator.run() - completion_mock.assert_called_once() - assert not evaluations[0].passed - - -@patch("openai.Completion.create", return_value=create_openai_object("same")) -def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock): - evaluator = SemanticEvaluator(model="gpt-3", 
workers=10) - evaluator.load( - [ - Prediction( - test=Test(input="Who are you?", expected=["Yoda I am."]), - output="I am Yoda.", - time_elapsed=0, - function_id=FunctionID.default(), - ) - for _ in range(100) - ] - ) - evaluations = evaluator.run() - assert completion_mock.call_count == 100 - assert all([evaluation.passed for evaluation in evaluations])