From 9610f38f0ef70d6d8f89121ee149ae06d7058395 Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Wed, 21 Aug 2024 07:14:29 -0700
Subject: [PATCH 1/3] Remove functionality for `BenchmarkRunner` without ground
 truth (#2674)

Summary:
Pull Request resolved: https://github.com/facebook/Ax/pull/2674

Context: This is an alternative to D61431979.

Note: There are benchmarks that do not use `BenchmarkRunner`, but I plan to
have them all use `BenchmarkRunner` in the future.

`BenchmarkRunner` technically supports benchmarks without a ground truth, but
that functionality is never used, and there aren't any Ax benchmarks that are
noisy *and* don't have a ground truth. It is not conceptually clear how such a
case should be benchmarked, so it is better not to over-engineer for that
need, which may never arise. Instead, benchmarks that lack a ground truth but
are deterministic can be treated as noiseless problems with a ground truth,
and we can drop support for problems without a ground truth.

Also, `BenchmarkRunner` has some methods that must be defined when there is a
ground truth and must not be defined otherwise. They can't be abstract because
they will not always be defined. With this change, we can make the
ground-truth methods abstract and get rid of the rest.

This PR:
- Rewrites docstrings
- Removes method `get_Y_Ystd`
- Makes `get_Y_true` and other methods abstract
- Removes functionality for the case where `get_Y_true` raises a
  `NotImplementedError`

Reviewed By: ItsMrLin

Differential Revision: D61483962
---
 ax/benchmark/runners/base.py | 106 ++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

diff --git a/ax/benchmark/runners/base.py b/ax/benchmark/runners/base.py
index f300742e33f..83469d8ca20 100644
--- a/ax/benchmark/runners/base.py
+++ b/ax/benchmark/runners/base.py
@@ -5,9 +5,9 @@
 
 # pyre-strict
 
-from abc import ABC, abstractmethod
+from abc import ABC, abstractmethod, abstractproperty
 from math import sqrt
-from typing import Any, Optional, Union
+from typing import Any, Union
 
 import torch
 from ax.core.arm import Arm
@@ -21,45 +21,44 @@ class BenchmarkRunner(Runner, ABC):
-
-    @property
-    @abstractmethod
+    """
+    A Runner that produces both observed and ground-truth values.
+
+    Observed values equal ground-truth values plus noise, with the noise added
+    according to the standard deviations returned by `get_noise_stds()`.
+
+    This runner does require that every benchmark has a ground truth, which
+    won't necessarily be true for real-world problems. Such problems fall into
+    two categories:
+    - If they are deterministic, they can be used with this runner by
+      viewing them as noiseless problems where the observed values are the
+      ground truth. The observed values will be used for tracking the
+      progress of optimization.
+    - If they are not deterministic, they are not supported. It is not
+      conceptually clear how to benchmark such problems, so we decided to
+      not over-engineer for that before such a use case arrives.
+    """
+
+    @abstractproperty
     def outcome_names(self) -> list[str]:
         """The names of the outcomes of the problem (in the order of the outcomes)."""
        pass  # pragma: no cover

    def get_Y_true(self, arm: Arm) -> Tensor:
-        """Function returning the ground truth values for a given arm. The
-        synthetic noise is added as part of the Runner's `run()` method.
- For problems that do not have a ground truth, the Runner must - implement the `get_Y_Ystd()` method instead.""" - raise NotImplementedError( - "Must implement method `get_Y_true()` for Runner " - f"{self.__class__.__name__} as it does not implement a " - "`get_Y_Ystd()` method." - ) + """ + Return the ground truth values for a given arm. + + Synthetic noise is added as part of the Runner's `run()` method. + """ + ... + @abstractmethod def get_noise_stds(self) -> Union[None, float, dict[str, float]]: - """Function returning the standard errors for the synthetic noise - to be applied to the observed values. For problems that do not have - a ground truth, the Runner must implement the `get_Y_Ystd()` method - instead.""" - raise NotImplementedError( - "Must implement method `get_Y_Ystd()` for Runner " - f"{self.__class__.__name__} as it does not implement a " - "`get_noise_stds()` method." - ) - - def get_Y_Ystd(self, arm: Arm) -> tuple[Tensor, Optional[Tensor]]: - """Function returning the observed values and their standard errors - for a given arm. This function is unused for problems that have a - ground truth (in this case `get_Y_true()` is used), and is required - for problems that do not have a ground truth.""" - raise NotImplementedError( - "Must implement method `get_Y_Ystd()` for Runner " - f"{self.__class__.__name__} as it does not implement a " - "`get_Y_true()` method." - ) + """ + Return the standard errors for the synthetic noise to be applied to the + observed values. + """ + ... def run(self, trial: BaseTrial) -> dict[str, Any]: """Run the trial by evaluating its parameterization(s). @@ -110,33 +109,26 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: ) for arm in trial.arms: - try: - # Case where we do have a ground truth - Y_true = self.get_Y_true(arm) - Ys_true[arm.name] = Y_true.tolist() - if noise_stds is None: - # No noise, so just return the true outcome. - Ystds[arm.name] = [0.0] * len(Y_true) - Ys[arm.name] = Y_true.tolist() - else: - # We can scale the noise std by the inverse of the relative sample - # budget allocation to each arm. This works b/c (i) we assume that - # observations per unit sample budget are i.i.d. and (ii) the - # normalized weights sum to one. - std = noise_stds_tsr.to(Y_true) / sqrt(nlzd_arm_weights[arm]) - Ystds[arm.name] = std.tolist() - Ys[arm.name] = (Y_true + std * torch.randn_like(Y_true)).tolist() - except NotImplementedError: - # Case where we don't have a ground truth. - Y, Ystd = self.get_Y_Ystd(arm) - Ys[arm.name] = Y.tolist() - Ystds[arm.name] = Ystd.tolist() if Ystd is not None else None + # Case where we do have a ground truth + Y_true = self.get_Y_true(arm) + Ys_true[arm.name] = Y_true.tolist() + if noise_stds is None: + # No noise, so just return the true outcome. + Ystds[arm.name] = [0.0] * len(Y_true) + Ys[arm.name] = Y_true.tolist() + else: + # We can scale the noise std by the inverse of the relative sample + # budget allocation to each arm. This works b/c (i) we assume that + # observations per unit sample budget are i.i.d. and (ii) the + # normalized weights sum to one. 
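+                # For example (hypothetical numbers): with noise_std = 1.0 and two
+                # arms whose normalized weights are 2/3 and 1/3, the observed stds
+                # become 1.0 / sqrt(2/3) ≈ 1.22 and 1.0 / sqrt(1/3) ≈ 1.73, so the
+                # arm with the smaller share of the budget gets noisier data.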
+                std = noise_stds_tsr.to(Y_true) / sqrt(nlzd_arm_weights[arm])
+                Ystds[arm.name] = std.tolist()
+                Ys[arm.name] = (Y_true + std * torch.randn_like(Y_true)).tolist()
 
         run_metadata = {
             "Ys": Ys,
             "Ystds": Ystds,
             "outcome_names": self.outcome_names,
+            "Ys_true": Ys_true,
         }
-        if Ys_true:  # only add key if we actually have a ground truth
-            run_metadata["Ys_true"] = Ys_true
         return run_metadata

From 4f5c3384497efc09f2e0fbba7945d6cf3c71b627 Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Wed, 21 Aug 2024 07:14:29 -0700
Subject: [PATCH 2/3] Introduce `ParamBasedTestProblem` for benchmarking
 (#2675)

Summary:
Pull Request resolved: https://github.com/facebook/Ax/pull/2675

Context:

In a future refactor that will enable more flexible and powerful best-point
functionality, every BenchmarkProblem's runner will be able to produce an
"oracle" value (possibly the ground truth) for any arm, in-sample or not, with
a function like `BenchmarkRunner.evaluate_oracle(arm=arm)`, with the problem
handling computation and the runner formatting results.

However, the current `BenchmarkRunner` and `BenchmarkMetric` setup doesn't
cover every benchmark. Consolidating on `BenchmarkRunner` and `BenchmarkMetric`
will enable the refactor, make it easier to universalize functionality like
handling of constraints, noise, and inference regret, and will also allow for
deleting some LOC for the more custom problems.

Current `BenchmarkRunner`s only handle problems that can consume tensor-valued
arguments: BoTorch synthetic problems and surrogate problems. This isn't a
good fit for problems like Jenatton that have a hierarchical search space and
can have some parameters not passed. Because Ax always passes parameters and
only sometimes represents them as tensors, a `TParameterization` is a more
natural abstraction for handling parameters than a tensor.

This PR:
- Introduces `ParamBasedTestProblem`, which is like a BoTorch synthetic test
  problem but consumes a `TParameterization` rather than a tensor
- Adds `ParamBasedTestProblemRunner`, which shares a base class
  `SyntheticProblemRunner` and most functionality with
  `BotorchTestProblemRunner` (so it is a `BenchmarkRunner` and supports both
  observed and unobserved noise).

Differential Revision: D60996475
---
 ax/benchmark/runners/base.py | 17 +-
 ax/benchmark/runners/botorch_test.py | 256 +++++++++++++-----
 ax/benchmark/runners/surrogate.py | 12 +-
 .../runners/test_botorch_test_problem.py | 117 ++++++--
 ax/benchmark/tests/stubs.py | 26 ++
 ax/storage/json_store/registry.py | 7 +-
 ax/utils/testing/benchmark_stubs.py | 24 +-
 7 files changed, 341 insertions(+), 118 deletions(-)
 create mode 100644 ax/benchmark/tests/stubs.py

diff --git a/ax/benchmark/runners/base.py b/ax/benchmark/runners/base.py
index 83469d8ca20..cfb0eebd6f9 100644
--- a/ax/benchmark/runners/base.py
+++ b/ax/benchmark/runners/base.py
@@ -5,13 +5,15 @@
 
 # pyre-strict
 
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from math import sqrt
 from typing import Any, Union
 
 import torch
 from ax.core.arm import Arm
-from ax.core.base_trial import BaseTrial
+
+from ax.core.base_trial import BaseTrial, TrialStatus
 from ax.core.batch_trial import BatchTrial
 from ax.core.runner import Runner
 from ax.core.trial import Trial
@@ -39,10 +41,7 @@ class BenchmarkRunner(Runner, ABC):
          not over-engineer for that before such a use case arrives.
""" - @abstractproperty - def outcome_names(self) -> list[str]: - """The names of the outcomes of the problem (in the order of the outcomes).""" - pass # pragma: no cover + outcome_names: list[str] def get_Y_true(self, arm: Arm) -> Tensor: """ @@ -132,3 +131,9 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: "Ys_true": Ys_true, } return run_metadata + + # This will need to be udpated once asynchronous benchmarks are supported. + def poll_trial_status( + self, trials: Iterable[BaseTrial] + ) -> dict[TrialStatus, set[int]]: + return {TrialStatus.COMPLETED: {t.index for t in trials}} diff --git a/ax/benchmark/runners/botorch_test.py b/ax/benchmark/runners/botorch_test.py index 6796c4b533b..90fddbdefda 100644 --- a/ax/benchmark/runners/botorch_test.py +++ b/ax/benchmark/runners/botorch_test.py @@ -6,42 +6,76 @@ # pyre-strict import importlib -from collections.abc import Iterable +from abc import ABC, abstractmethod +from dataclasses import dataclass from typing import Any, Optional, Union import torch from ax.benchmark.runners.base import BenchmarkRunner from ax.core.arm import Arm -from ax.core.base_trial import BaseTrial, TrialStatus +from ax.core.types import TParameterization from ax.utils.common.base import Base from ax.utils.common.equality import equality_typechecker from ax.utils.common.serialization import TClassDecoderRegistry, TDecoderRegistry -from ax.utils.common.typeutils import checked_cast -from botorch.test_functions.base import BaseTestProblem, ConstrainedBaseTestProblem -from botorch.test_functions.multi_objective import MultiObjectiveTestProblem +from botorch.test_functions.synthetic import ( + ConstrainedSyntheticTestFunction, + SyntheticTestFunction, +) from botorch.utils.transforms import normalize, unnormalize +from pyre_extensions import assert_is_instance from torch import Tensor -class BotorchTestProblemRunner(BenchmarkRunner): - """A Runner for evaluating Botorch BaseTestProblems. +@dataclass(kw_only=True) +class ParamBasedTestProblem(ABC): + """ + Similar to a BoTorch test problem, but evaluated using an Ax + TParameterization rather than a tensor. + """ + + num_objectives: int + optimal_value: float + # Constraints could easily be supported similar to BoTorch test problems, + # but haven't been hooked up. + _is_constrained: bool = False + constraint_noise_std: Optional[Union[float, list[float]]] = None + noise_std: Optional[Union[float, list[float]]] = None + negate: bool = False + + @abstractmethod + def evaluate_true(self, params: TParameterization) -> Tensor: ... + + def evaluate_slack_true(self, params: TParameterization) -> Tensor: + raise NotImplementedError( + f"{self.__class__.__name__} does not support constraints." + ) + + # pyre-fixme: Missing parameter annotation [2]: Parameter `other` must have + # a type other than `Any`. + def __eq__(self, other: Any) -> bool: + if not isinstance(other, type(self)): + return False + return self.__class__.__name__ == other.__class__.__name__ + - Given a trial the Runner will evaluate the BaseTestProblem.forward method for each - arm in the trial, as well as return some metadata about the underlying Botorch - problem such as the noise_std. We compute the full result on the Runner (as opposed - to the Metric as is typical in synthetic test problems) because the BoTorch problem - computes all metrics in one stacked tensor in the MOO case, and we wish to avoid - recomputation per metric. 
+class SyntheticProblemRunner(BenchmarkRunner, ABC): + """A Runner for evaluating synthetic problems, either BoTorch + `SyntheticTestFunction`s or Ax benchmarking `ParamBasedTestProblem`s. + + Given a trial, the Runner will evaluate the problem noiselessly for each + arm in the trial, as well as return some metadata about the underlying + problem such as the noise_std. """ - test_problem: BaseTestProblem + test_problem: Union[SyntheticTestFunction, ParamBasedTestProblem] _is_constrained: bool - _test_problem_class: type[BaseTestProblem] + _test_problem_class: type[Union[SyntheticTestFunction, ParamBasedTestProblem]] _test_problem_kwargs: Optional[dict[str, Any]] def __init__( self, - test_problem_class: type[BaseTestProblem], + *, + test_problem_class: type[Union[SyntheticTestFunction, ParamBasedTestProblem]], test_problem_kwargs: dict[str, Any], outcome_names: list[str], modified_bounds: Optional[list[tuple[float, float]]] = None, @@ -49,7 +83,8 @@ def __init__( """Initialize the test problem runner. Args: - test_problem_class: The BoTorch test problem class. + test_problem_class: A BoTorch `SyntheticTestFunction` class or Ax + `ParamBasedTestProblem` class. test_problem_kwargs: The keyword arguments used for initializing the test problem. outcome_names: The names of the outcomes returned by the problem. @@ -63,28 +98,27 @@ def __init__( If modified bounds are not provided, the test problem will be evaluated using the raw parameter values. """ - self._test_problem_class = test_problem_class self._test_problem_kwargs = test_problem_kwargs - - # pyre-fixme [45]: Invalid class instantiation - self.test_problem = test_problem_class(**test_problem_kwargs).to( - dtype=torch.double + self.test_problem = ( + # pyre-fixme: Invalid class instantiation [45]: Cannot instantiate + # abstract class with abstract method `evaluate_true`. + test_problem_class(**test_problem_kwargs) ) + if isinstance(self.test_problem, SyntheticTestFunction): + self.test_problem = self.test_problem.to(dtype=torch.double) + # A `ConstrainedSyntheticTestFunction` is a type of `SyntheticTestFunction`; a + # `ParamBasedTestProblem` is never constrained. self._is_constrained: bool = isinstance( - self.test_problem, ConstrainedBaseTestProblem + self.test_problem, ConstrainedSyntheticTestFunction ) - self._is_moo: bool = isinstance(self.test_problem, MultiObjectiveTestProblem) - self._outcome_names = outcome_names + self._is_moo: bool = self.test_problem.num_objectives > 1 + self.outcome_names = outcome_names self._modified_bounds = modified_bounds - @property - def outcome_names(self) -> list[str]: - return self._outcome_names - @equality_typechecker def __eq__(self, other: Base) -> bool: - if not isinstance(other, BotorchTestProblemRunner): + if not isinstance(other, type(self)): return False return ( @@ -129,12 +163,95 @@ def get_noise_stds(self) -> Union[None, float, dict[str, float]]: return noise_std_dict + @classmethod + # pyre-fixme [2]: Parameter `obj` must have a type other than `Any`` + def serialize_init_args(cls, obj: Any) -> dict[str, Any]: + """Serialize the properties needed to initialize the runner. + Used for storage. 
+ """ + runner = assert_is_instance(obj, cls) + + return { + "test_problem_module": runner._test_problem_class.__module__, + "test_problem_class_name": runner._test_problem_class.__name__, + "test_problem_kwargs": runner._test_problem_kwargs, + "outcome_names": runner.outcome_names, + "modified_bounds": runner._modified_bounds, + } + + @classmethod + def deserialize_init_args( + cls, + args: dict[str, Any], + decoder_registry: Optional[TDecoderRegistry] = None, + class_decoder_registry: Optional[TClassDecoderRegistry] = None, + ) -> dict[str, Any]: + """Given a dictionary, deserialize the properties needed to initialize the + runner. Used for storage. + """ + + module = importlib.import_module(args["test_problem_module"]) + + return { + "test_problem_class": getattr(module, args["test_problem_class_name"]), + "test_problem_kwargs": args["test_problem_kwargs"], + "outcome_names": args["outcome_names"], + "modified_bounds": args["modified_bounds"], + } + + +class BotorchTestProblemRunner(SyntheticProblemRunner): + """ + A `SyntheticProblemRunner` for BoTorch `SyntheticTestFunction`s. + + Args: + test_problem_class: A BoTorch `SyntheticTestFunction` class. + test_problem_kwargs: The keyword arguments used for initializing the + test problem. + outcome_names: The names of the outcomes returned by the problem. + modified_bounds: The bounds that are used by the Ax search space + while optimizing the problem. If different from the bounds of the + test problem, we project the parameters into the test problem + bounds before evaluating the test problem. + For example, if the test problem is defined on [0, 1] but the Ax + search space is integers in [0, 10], an Ax parameter value of + 5 will correspond to 0.5 while evaluating the test problem. + If modified bounds are not provided, the test problem will be + evaluated using the raw parameter values. + """ + + def __init__( + self, + *, + test_problem_class: type[SyntheticTestFunction], + test_problem_kwargs: dict[str, Any], + outcome_names: list[str], + modified_bounds: Optional[list[tuple[float, float]]] = None, + ) -> None: + super().__init__( + test_problem_class=test_problem_class, + test_problem_kwargs=test_problem_kwargs, + outcome_names=outcome_names, + modified_bounds=modified_bounds, + ) + self.test_problem: SyntheticTestFunction = self.test_problem.to( + dtype=torch.double + ) + self._is_constrained: bool = isinstance( + self.test_problem, ConstrainedSyntheticTestFunction + ) + def get_Y_true(self, arm: Arm) -> Tensor: - """Converts X to original bounds -- only if modified bounds were provided -- - and evaluates the test problem. See `__init__` docstring for details. + """ + Convert the arm to a tensor and evaluate it on the base test problem. + + Convert the tensor to original bounds -- only if modified bounds were + provided -- and evaluates the test problem. See the docstring for + `modified_bounds` in `BotorchTestProblemRunner.__init__` for details. Args: - X: A `batch_shape x d`-dim tensor of point(s) at which to evaluate the + arm: Arm to evaluate. It will be converted to a + `batch_shape x d`-dim tensor of point(s) at which to evaluate the test problem. 
Returns: @@ -157,7 +274,7 @@ def get_Y_true(self, arm: Arm) -> Tensor: X = unnormalize(unit_X, self.test_problem.bounds) Y_true = self.test_problem.evaluate_true(X).view(-1) - # `BaseTestProblem.evaluate_true()` does not negate the outcome + # `SyntheticTestFunction.evaluate_true()` does not negate the outcome if self.test_problem.negate: Y_true = -Y_true @@ -171,43 +288,44 @@ def get_Y_true(self, arm: Arm) -> Tensor: return Y_true - def poll_trial_status( - self, trials: Iterable[BaseTrial] - ) -> dict[TrialStatus, set[int]]: - return {TrialStatus.COMPLETED: {t.index for t in trials}} - @classmethod - # pyre-fixme [2]: Parameter `obj` must have a type other than `Any`` - def serialize_init_args(cls, obj: Any) -> dict[str, Any]: - """Serialize the properties needed to initialize the runner. - Used for storage. - """ - runner = checked_cast(BotorchTestProblemRunner, obj) +class ParamBasedTestProblemRunner(SyntheticProblemRunner): + """ + A `SyntheticProblemRunner` for `ParamBasedTestProblem`s. See + `SyntheticProblemRunner` for more information. + """ - return { - "test_problem_module": runner._test_problem_class.__module__, - "test_problem_class_name": runner._test_problem_class.__name__, - "test_problem_kwargs": runner._test_problem_kwargs, - "outcome_names": runner._outcome_names, - "modified_bounds": runner._modified_bounds, - } + # This could easily be supported, but hasn't been hooked up + _is_constrained: bool = False - @classmethod - def deserialize_init_args( - cls, - args: dict[str, Any], - decoder_registry: Optional[TDecoderRegistry] = None, - class_decoder_registry: Optional[TClassDecoderRegistry] = None, - ) -> dict[str, Any]: - """Given a dictionary, deserialize the properties needed to initialize the - runner. Used for storage. - """ + def __init__( + self, + *, + test_problem_class: type[ParamBasedTestProblem], + test_problem_kwargs: dict[str, Any], + outcome_names: list[str], + modified_bounds: Optional[list[tuple[float, float]]] = None, + ) -> None: + if modified_bounds is not None: + raise NotImplementedError( + f"modified_bounds is not supported for {test_problem_class.__name__}" + ) + super().__init__( + test_problem_class=test_problem_class, + test_problem_kwargs=test_problem_kwargs, + outcome_names=outcome_names, + modified_bounds=modified_bounds, + ) + self.test_problem: ParamBasedTestProblem = self.test_problem - module = importlib.import_module(args["test_problem_module"]) + def get_Y_true(self, arm: Arm) -> Tensor: + """Evaluates the test problem. - return { - "test_problem_class": getattr(module, args["test_problem_class_name"]), - "test_problem_kwargs": args["test_problem_kwargs"], - "outcome_names": args["outcome_names"], - "modified_bounds": args["modified_bounds"], - } + Returns: + A `batch_shape x m`-dim tensor of ground truth (noiseless) evaluations. 
+ """ + Y_true = self.test_problem.evaluate_true(arm.parameters).view(-1) + # `ParamBasedTestProblem.evaluate_true()` does not negate the outcome + if self.test_problem.negate: + Y_true = -Y_true + return Y_true diff --git a/ax/benchmark/runners/surrogate.py b/ax/benchmark/runners/surrogate.py index 0054b64dc32..c990d9aa09a 100644 --- a/ax/benchmark/runners/surrogate.py +++ b/ax/benchmark/runners/surrogate.py @@ -6,7 +6,6 @@ # pyre-strict import warnings -from collections.abc import Iterable from typing import Any, Callable, Optional, Union import torch @@ -68,7 +67,7 @@ def __init__( self.get_surrogate_and_datasets = get_surrogate_and_datasets self.name = name self._surrogate = surrogate - self._outcome_names = outcome_names + self.outcome_names = outcome_names self._datasets = datasets self.search_space = search_space self.noise_stds = noise_stds @@ -89,10 +88,6 @@ def datasets(self) -> list[SupervisedDataset]: self.set_surrogate_and_datasets() return none_throws(self._datasets) - @property - def outcome_names(self) -> list[str]: - return self._outcome_names - def get_noise_stds(self) -> Union[None, float, dict[str, float]]: return self.noise_stds @@ -135,11 +130,6 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: run_metadata["outcome_names"] = self.outcome_names return run_metadata - def poll_trial_status( - self, trials: Iterable[BaseTrial] - ) -> dict[TrialStatus, set[int]]: - return {TrialStatus.COMPLETED: {t.index for t in trials}} - @classmethod # pyre-fixme[2]: Parameter annotation cannot be `Any`. def serialize_init_args(cls, obj: Any) -> dict[str, Any]: diff --git a/ax/benchmark/tests/runners/test_botorch_test_problem.py b/ax/benchmark/tests/runners/test_botorch_test_problem.py index 1ca0ab3e24d..ea929723e3d 100644 --- a/ax/benchmark/tests/runners/test_botorch_test_problem.py +++ b/ax/benchmark/tests/runners/test_botorch_test_problem.py @@ -8,35 +8,79 @@ from itertools import product -from typing import Union from unittest.mock import Mock import torch -from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner +from ax.benchmark.runners.botorch_test import ( + BotorchTestProblemRunner, + ParamBasedTestProblemRunner, +) from ax.core.arm import Arm from ax.core.base_trial import TrialStatus from ax.core.trial import Trial from ax.utils.common.testutils import TestCase from ax.utils.common.typeutils import checked_cast -from botorch.test_functions.base import ConstrainedBaseTestProblem +from ax.utils.testing.benchmark_stubs import TestParamBasedTestProblem +from botorch.test_functions.base import BaseTestProblem, ConstrainedBaseTestProblem from botorch.test_functions.synthetic import ConstrainedHartmann, Hartmann from botorch.utils.transforms import normalize - - -class TestBotorchTestProblemRunner(TestCase): - def test_botorch_test_problem_runner(self) -> None: - for test_problem_class, modified_bounds, noise_std in product( - (Hartmann, ConstrainedHartmann), (None, [(0.0, 2.0)] * 6), (None, 0.1) +from pyre_extensions import assert_is_instance + + +class TestSyntheticRunner(TestCase): + def test_synthetic_runner(self) -> None: + botorch_cases = [ + ( + BotorchTestProblemRunner, + test_problem_class, + {"dim": 6}, + modified_bounds, + noise_std, + ) + for test_problem_class, modified_bounds, noise_std in product( + (Hartmann, ConstrainedHartmann), + (None, [(0.0, 2.0)] * 6), + (None, 0.1), + ) + ] + param_based_cases = [ + ( + ParamBasedTestProblemRunner, + TestParamBasedTestProblem, + {"num_objectives": num_objectives, "dim": 6}, + None, + noise_std, + ) + for 
num_objectives, noise_std in product((1, 2), (None, 0.0, 1.0)) + ] + for ( + runner_cls, + test_problem_class, + test_problem_kwargs, + modified_bounds, + noise_std, + ) in ( + botorch_cases + param_based_cases ): - test_problem = test_problem_class(dim=6).to(dtype=torch.double) - test_problem_kwargs: dict[str, Union[int, float]] = {"dim": 6} if noise_std is not None: + # pyre-fixme[6]: Incompatible parameter type: Expected int, got float test_problem_kwargs["noise_std"] = noise_std - outcome_names = ["objective"] + + num_objectives = ( + test_problem_kwargs["num_objectives"] + if "num_objectives" in test_problem_kwargs + else 1 + ) + outcome_names = [f"objective_{i}" for i in range(num_objectives)] if test_problem_class == ConstrainedHartmann: outcome_names = outcome_names + ["constraint"] - runner = BotorchTestProblemRunner( + runner = runner_cls( + # pyre-fixme[6]: Incompatible parameter type: In call + # `BotorchTestProblemRunner.__init__`, for argument + # `test_problem_class`, expected `Type[BaseTestProblem]` but got + # `Union[Type[ConstrainedHartmann], Type[Hartmann], + # Type[TestParamBasedTestProblem]]`. test_problem_class=test_problem_class, test_problem_kwargs=test_problem_kwargs, outcome_names=outcome_names, @@ -51,11 +95,9 @@ def test_botorch_test_problem_runner(self) -> None: with self.subTest(f"Test basic construction, {test_description}"): self.assertIsInstance(runner.test_problem, test_problem_class) - self.assertEqual(runner.test_problem.dim, test_problem_kwargs["dim"]) - self.assertEqual(runner.test_problem.bounds.dtype, torch.double) self.assertEqual( runner._is_constrained, - isinstance(test_problem, ConstrainedBaseTestProblem), + issubclass(test_problem_class, ConstrainedBaseTestProblem), ) self.assertEqual(runner._modified_bounds, modified_bounds) if noise_std is not None: @@ -66,6 +108,17 @@ def test_botorch_test_problem_runner(self) -> None: # check equality with different class self.assertNotEqual(runner, Hartmann(dim=6)) self.assertEqual(runner, runner) + self.assertEqual(runner._is_moo, num_objectives > 1) + if issubclass(test_problem_class, BaseTestProblem): + self.assertEqual( + runner.test_problem.dim, test_problem_kwargs["dim"] + ) + self.assertEqual( + assert_is_instance( + runner.test_problem, BaseTestProblem + ).bounds.dtype, + torch.double, + ) with self.subTest(f"test `get_Y_true()`, {test_description}"): X = torch.rand(1, 6, dtype=torch.double) @@ -80,16 +133,22 @@ def test_botorch_test_problem_runner(self) -> None: ) else: X_tf = X - obj = test_problem.evaluate_true(X_tf) - if test_problem.negate: - obj = -obj - if runner._is_constrained: - expected_Y = torch.cat( - [obj.view(-1), test_problem.evaluate_slack(X_tf).view(-1)], - dim=-1, - ) + test_problem = runner.test_problem + if issubclass(test_problem_class, BaseTestProblem): + obj = test_problem.evaluate_true(X_tf) + if test_problem.negate: + obj = -obj + if runner._is_constrained: + expected_Y = torch.cat( + [obj.view(-1), test_problem.evaluate_slack(X_tf).view(-1)], + dim=-1, + ) + else: + expected_Y = obj else: - expected_Y = obj + expected_Y = torch.full( + torch.Size([2]), X.pow(2).sum().item(), dtype=torch.double + ) self.assertTrue(torch.allclose(Y, expected_Y)) with self.subTest(f"test `run()`, {test_description}"): @@ -116,21 +175,19 @@ def test_botorch_test_problem_runner(self) -> None: ) with self.subTest(f"test `serialize_init_args()`, {test_description}"): - serialize_init_args = BotorchTestProblemRunner.serialize_init_args( - obj=runner - ) + serialize_init_args = 
runner_cls.serialize_init_args(obj=runner) self.assertEqual( serialize_init_args, { "test_problem_module": runner._test_problem_class.__module__, "test_problem_class_name": runner._test_problem_class.__name__, "test_problem_kwargs": runner._test_problem_kwargs, - "outcome_names": runner._outcome_names, + "outcome_names": runner.outcome_names, "modified_bounds": runner._modified_bounds, }, ) # test deserialize args - deserialize_init_args = BotorchTestProblemRunner.deserialize_init_args( + deserialize_init_args = runner_cls.deserialize_init_args( serialize_init_args ) self.assertEqual( diff --git a/ax/benchmark/tests/stubs.py b/ax/benchmark/tests/stubs.py new file mode 100644 index 00000000000..0d01d980a50 --- /dev/null +++ b/ax/benchmark/tests/stubs.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict +from typing import Optional, Union + +import torch +from ax.benchmark.runners.botorch_test import ParamBasedTestProblem + + +class TestParamBasedTestProblem(ParamBasedTestProblem): + optimal_value: float = 0.0 + + def __init__( + self, num_objectives: int, noise_std: Optional[Union[float, list[float]]] + ) -> None: + self.num_objectives = num_objectives + self.noise_std = noise_std + + # pyre-fixme[14]: Inconsistent override, as dict[str, float] is not a + # `TParameterization` + def evaluate_true(self, params: dict[str, float]) -> torch.Tensor: + value = sum(elt**2 for elt in params.values()) + return value * torch.ones(self.num_objectives, dtype=torch.double) diff --git a/ax/storage/json_store/registry.py b/ax/storage/json_store/registry.py index 46ee42b8d6c..af8fd720538 100644 --- a/ax/storage/json_store/registry.py +++ b/ax/storage/json_store/registry.py @@ -23,7 +23,10 @@ PyTorchCNNTorchvisionBenchmarkProblem, PyTorchCNNTorchvisionRunner, ) -from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner +from ax.benchmark.runners.botorch_test import ( + BotorchTestProblemRunner, + ParamBasedTestProblemRunner, +) from ax.benchmark.runners.surrogate import SurrogateRunner from ax.core import Experiment, ObservationFeatures from ax.core.arm import Arm @@ -238,6 +241,7 @@ OrEarlyStoppingStrategy: logical_early_stopping_strategy_to_dict, OrderConstraint: order_parameter_constraint_to_dict, OutcomeConstraint: outcome_constraint_to_dict, + ParamBasedTestProblemRunner: runner_to_dict, ParameterConstraint: parameter_constraint_to_dict, ParameterDistribution: parameter_distribution_to_dict, pathlib.Path: pathlib_to_dict, @@ -363,6 +367,7 @@ "OrEarlyStoppingStrategy": OrEarlyStoppingStrategy, "OrderConstraint": OrderConstraint, "OutcomeConstraint": OutcomeConstraint, + "ParamBasedTestProblemRunner": ParamBasedTestProblemRunner, "ParameterConstraint": ParameterConstraint, "ParameterConstraintType": ParameterConstraintType, "ParameterDistribution": ParameterDistribution, diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py index ecfb9de88ec..80d3ac9daa8 100644 --- a/ax/utils/testing/benchmark_stubs.py +++ b/ax/utils/testing/benchmark_stubs.py @@ -6,9 +6,10 @@ # pyre-strict -from typing import Any, Optional +from typing import Any, Optional, Union import numpy as np +import torch from ax.benchmark.benchmark_method import BenchmarkMethod from ax.benchmark.benchmark_problem import ( BenchmarkProblem, @@ -21,6 +22,7 @@ MOOSurrogateBenchmarkProblem, SOOSurrogateBenchmarkProblem, ) +from 
ax.benchmark.runners.botorch_test import ParamBasedTestProblem
 from ax.benchmark.runners.surrogate import SurrogateRunner
 from ax.core.experiment import Experiment
 from ax.core.optimization_config import (
@@ -218,3 +220,23 @@ def get_benchmark_result() -> BenchmarkResult:
 def get_aggregated_benchmark_result() -> AggregatedBenchmarkResult:
     result = get_benchmark_result()
     return AggregatedBenchmarkResult.from_benchmark_results([result, result])
+
+
+class TestParamBasedTestProblem(ParamBasedTestProblem):
+    optimal_value: float = 0.0
+
+    def __init__(
+        self,
+        num_objectives: int,
+        noise_std: Optional[Union[float, list[float]]] = None,
+        dim: int = 6,
+    ) -> None:
+        self.num_objectives = num_objectives
+        self.noise_std = noise_std
+        self.dim = dim
+
+    # pyre-fixme[14]: Inconsistent override, as dict[str, float] is not a
+    # `TParameterization`
+    def evaluate_true(self, params: dict[str, float]) -> torch.Tensor:
+        value = sum(elt**2 for elt in params.values())
+        return value * torch.ones(self.num_objectives, dtype=torch.double)

From 203c7e8f39fa7e9b9ff00f0d1e20c5fffc3ba7af Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Wed, 21 Aug 2024 07:21:51 -0700
Subject: [PATCH 3/3] Migrate Jenatton to use BenchmarkRunner and
 BenchmarkMetric (#2676)

Summary:
Pull Request resolved: https://github.com/facebook/Ax/pull/2676

This PR:
- Has Jenatton use `ParamBasedTestProblem` so that it can use
  `ParamBasedTestProblemRunner`, and also has it use `BenchmarkMetric`, getting
  rid of the specialized Jenatton runners and metrics. This enables Jenatton to
  handle noisy problems, whether noise levels are observed or not, like other
  benchmark problems, and will make it easy to add constraints or benefit from
  other new functionality. A short usage sketch follows below.
- Does *not* clean up the now-unnecessary Jenatton metric file; that happens
  in the next diff.
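As an illustration (a sketch, not code from this diff; it uses only APIs
introduced or updated in this stack, and the expected value comes from the
updated Jenatton tests), the migrated problem can be built and evaluated like
so:

    from ax.benchmark.problems.synthetic.hss.jenatton import (
        get_jenatton_benchmark_problem,
    )
    from ax.core.arm import Arm

    # The problem's runner is now a ParamBasedTestProblemRunner wrapping the
    # `Jenatton` ParamBasedTestProblem defined in this diff.
    problem = get_jenatton_benchmark_problem(noise_std=0.1, observe_noise_sd=True)

    # Ground-truth evaluation of one arm of the hierarchical search space;
    # with x1=0, x2=1, x5=2.0, r8=0.05 the Jenatton value is ~4.25.
    arm = Arm(name="0_0", parameters={"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05})
    y_true = problem.runner.get_Y_true(arm)  # 1-d tensor, value ~4.25

    # `problem.runner.run(trial)` would then add N(0, 0.1**2) noise to produce
    # the observed values recorded in the trial's run metadata.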
Reviewed By: Balandat Differential Revision: D61502458 --- ax/benchmark/metrics/jenatton.py | 69 +---- .../problems/synthetic/hss/jenatton.py | 56 ++++- ax/benchmark/tests/metrics/test_jennaton.py | 235 +++++++++++++----- ax/benchmark/tests/test_benchmark.py | 22 +- ax/storage/json_store/registry.py | 3 - .../json_store/tests/test_json_store.py | 4 +- 6 files changed, 236 insertions(+), 153 deletions(-) diff --git a/ax/benchmark/metrics/jenatton.py b/ax/benchmark/metrics/jenatton.py index dd1da3205fd..4be7f5eae1b 100644 --- a/ax/benchmark/metrics/jenatton.py +++ b/ax/benchmark/metrics/jenatton.py @@ -5,78 +5,11 @@ # pyre-strict -from __future__ import annotations +from typing import Optional -from typing import Any, Optional - -import numpy as np -import pandas as pd -from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin -from ax.core.base_trial import BaseTrial -from ax.core.data import Data -from ax.core.metric import MetricFetchE, MetricFetchResult -from ax.utils.common.result import Err, Ok from ax.utils.common.typeutils import not_none -class JenattonMetric(BenchmarkMetricBase): - """Jenatton metric for hierarchical search spaces.""" - - has_ground_truth: bool = True - - def __init__( - self, - name: str = "jenatton", - noise_std: float = 0.0, - observe_noise_sd: bool = False, - ) -> None: - super().__init__(name=name) - self.noise_std = noise_std - self.observe_noise_sd = observe_noise_sd - self.lower_is_better = True - - def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult: - try: - mean = [ - jenatton_test_function(**arm.parameters) # pyre-ignore [6] - for _, arm in trial.arms_by_name.items() - ] - if self.noise_std != 0: - mean = [m + self.noise_std * np.random.randn() for m in mean] - df = pd.DataFrame( - { - "arm_name": [name for name, _ in trial.arms_by_name.items()], - "metric_name": self.name, - "mean": mean, - "sem": self.noise_std if self.observe_noise_sd else None, - "trial_index": trial.index, - } - ) - return Ok(value=Data(df=df)) - - except Exception as e: - return Err( - MetricFetchE(message=f"Failed to fetch {self.name}", exception=e) - ) - - def make_ground_truth_metric(self) -> GroundTruthJenattonMetric: - return GroundTruthJenattonMetric(original_metric=self) - - -class GroundTruthJenattonMetric(JenattonMetric, GroundTruthMetricMixin): - def __init__(self, original_metric: JenattonMetric) -> None: - """ - Args: - original_metric: The original JenattonMetric to which this metric - corresponds. 
- """ - super().__init__( - name=self.get_ground_truth_name(original_metric), - noise_std=0.0, - observe_noise_sd=False, - ) - - def jenatton_test_function( x1: Optional[int] = None, x2: Optional[int] = None, diff --git a/ax/benchmark/problems/synthetic/hss/jenatton.py b/ax/benchmark/problems/synthetic/hss/jenatton.py index f545ac39400..67fa1755372 100644 --- a/ax/benchmark/problems/synthetic/hss/jenatton.py +++ b/ax/benchmark/problems/synthetic/hss/jenatton.py @@ -5,18 +5,52 @@ # pyre-strict +from dataclasses import dataclass +from typing import Optional + +import torch from ax.benchmark.benchmark_problem import BenchmarkProblem -from ax.benchmark.metrics.jenatton import JenattonMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric +from ax.benchmark.metrics.jenatton import jenatton_test_function +from ax.benchmark.runners.botorch_test import ( + ParamBasedTestProblem, + ParamBasedTestProblemRunner, +) from ax.core.objective import Objective from ax.core.optimization_config import OptimizationConfig from ax.core.parameter import ChoiceParameter, ParameterType, RangeParameter from ax.core.search_space import HierarchicalSearchSpace -from ax.runners.synthetic import SyntheticRunner +from ax.core.types import TParameterization + + +@dataclass(kw_only=True) +class Jenatton(ParamBasedTestProblem): + r"""Jenatton test function for hierarchical search spaces. + + This function is taken from: + + R. Jenatton, C. Archambeau, J. González, and M. Seeger. Bayesian + optimization with tree-structured dependencies. ICML 2017. + """ + + noise_std: Optional[float] = None + negate: bool = False + num_objectives: int = 1 + optimal_value: float = 0.1 + _is_constrained: bool = False + + def evaluate_true(self, params: TParameterization) -> torch.Tensor: + # pyre-fixme: Incompatible parameter type [6]: In call + # `jenatton_test_function`, for 1st positional argument, expected + # `Optional[float]` but got `Union[None, bool, float, int, str]`. 
+ value = jenatton_test_function(**params) + return torch.tensor(value) def get_jenatton_benchmark_problem( num_trials: int = 50, observe_noise_sd: bool = False, + noise_std: float = 0.0, ) -> BenchmarkProblem: search_space = HierarchicalSearchSpace( parameters=[ @@ -55,24 +89,28 @@ def get_jenatton_benchmark_problem( ), ] ) + name = "Jenatton" + ("_observed_noise" if observe_noise_sd else "") optimization_config = OptimizationConfig( objective=Objective( - metric=JenattonMetric(observe_noise_sd=observe_noise_sd), + metric=BenchmarkMetric( + name=name, observe_noise_sd=observe_noise_sd, lower_is_better=True + ), minimize=True, ) ) - - name = "Jenatton" + ("_observed_noise" if observe_noise_sd else "") - return BenchmarkProblem( name=name, search_space=search_space, optimization_config=optimization_config, - runner=SyntheticRunner(), + runner=ParamBasedTestProblemRunner( + test_problem_class=Jenatton, + test_problem_kwargs={"noise_std": noise_std}, + outcome_names=[name], + ), num_trials=num_trials, - is_noiseless=True, + is_noiseless=noise_std == 0.0, observe_noise_stds=observe_noise_sd, has_ground_truth=True, - optimal_value=0.1, + optimal_value=Jenatton.optimal_value, ) diff --git a/ax/benchmark/tests/metrics/test_jennaton.py b/ax/benchmark/tests/metrics/test_jennaton.py index f7cb2474a13..3f09fa07e94 100644 --- a/ax/benchmark/tests/metrics/test_jennaton.py +++ b/ax/benchmark/tests/metrics/test_jennaton.py @@ -7,107 +7,216 @@ import math from random import random -from unittest import mock -from ax.benchmark.metrics.jenatton import jenatton_test_function, JenattonMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric + +from ax.benchmark.metrics.jenatton import jenatton_test_function +from ax.benchmark.problems.synthetic.hss.jenatton import get_jenatton_benchmark_problem +from ax.benchmark.runners.base import BenchmarkRunner +from ax.benchmark.runners.botorch_test import ParamBasedTestProblemRunner from ax.core.arm import Arm +from ax.core.data import Data +from ax.core.experiment import Experiment from ax.core.trial import Trial +from ax.core.types import TParameterization from ax.utils.common.testutils import TestCase +from pyre_extensions import assert_is_instance -class JenattonMetricTest(TestCase): +class JenattonTest(TestCase): def test_jenatton_test_function(self) -> None: + benchmark_problem = get_jenatton_benchmark_problem() + rand_params = {f"x{i}": random() for i in range(4, 8)} rand_params["r8"] = random() rand_params["r9"] = random() + cases: list[tuple[TParameterization, float]] = [] + for x3 in (0, 1): - self.assertAlmostEqual( - jenatton_test_function( - x1=0, - x2=0, - x3=x3, - **{**rand_params, "x4": 2.0, "r8": 0.05}, + # list of (param dict, expected value) + cases.append( + ( + { + "x1": 0, + "x2": 0, + "x3": x3, + **{**rand_params, "x4": 2.0, "r8": 0.05}, + }, + 4.15, ), - 4.15, ) - self.assertAlmostEqual( - jenatton_test_function( - x1=0, - x2=1, - x3=x3, - **{**rand_params, "x5": 2.0, "r8": 0.05}, - ), - 4.25, + cases.append( + ( + { + "x1": 0, + "x2": 1, + "x3": x3, + **{**rand_params, "x5": 2.0, "r8": 0.05}, + }, + 4.25, + ) ) + for x2 in (0, 1): + cases.append( + ( + { + "x1": 1, + "x2": x2, + "x3": 0, + **{**rand_params, "x6": 2.0, "r9": 0.05}, + }, + 4.35, + ) + ) + cases.append( + ( + { + "x1": 1, + "x2": x2, + "x3": 1, + **{**rand_params, "x7": 2.0, "r9": 0.05}, + }, + 4.45, + ) + ) + + for params, value in cases: + arm = Arm(parameters=params) self.assertAlmostEqual( - jenatton_test_function( - x1=1, - x2=x2, - x3=0, 
- **{**rand_params, "x6": 2.0, "r9": 0.05}, - ), - 4.35, + # pyre-fixme: Incompatible parameter type [6]: In call + # `jenatton_test_function`, for 1st positional argument, + # expected `Optional[float]` but got `Union[None, bool, float, + # int, str]`. + jenatton_test_function(**params), + value, ) self.assertAlmostEqual( - jenatton_test_function( - x1=1, - x2=x2, - x3=1, - **{**rand_params, "x7": 2.0, "r9": 0.05}, - ), - 4.45, + assert_is_instance(benchmark_problem.runner, BenchmarkRunner) + .get_Y_true(arm) + .item(), + value, + places=6, ) - def test_init(self) -> None: - metric = JenattonMetric() - self.assertEqual(metric.name, "jenatton") + def test_create_problem(self) -> None: + problem = get_jenatton_benchmark_problem() + objective = problem.optimization_config.objective + metric = objective.metric + + self.assertEqual(metric.name, "Jenatton") + self.assertTrue(objective.minimize) self.assertTrue(metric.lower_is_better) - self.assertEqual(metric.noise_std, 0.0) - self.assertFalse(metric.observe_noise_sd) - metric = JenattonMetric(name="nottanej", noise_std=0.1, observe_noise_sd=True) - self.assertEqual(metric.name, "nottanej") + self.assertEqual( + assert_is_instance( + problem.runner, ParamBasedTestProblemRunner + ).test_problem.noise_std, + 0.0, + ) + self.assertTrue(problem.is_noiseless) + self.assertFalse(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) + + problem = get_jenatton_benchmark_problem( + num_trials=10, noise_std=0.1, observe_noise_sd=True + ) + objective = problem.optimization_config.objective + metric = objective.metric self.assertTrue(metric.lower_is_better) - self.assertEqual(metric.noise_std, 0.1) - self.assertTrue(metric.observe_noise_sd) + self.assertEqual( + assert_is_instance( + problem.runner, ParamBasedTestProblemRunner + ).test_problem.noise_std, + 0.1, + ) + self.assertFalse(problem.is_noiseless) + self.assertTrue(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) def test_fetch_trial_data(self) -> None: - arm = mock.Mock(spec=Arm) - arm.parameters = {"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05} - trial = mock.Mock(spec=Trial) - trial.arms_by_name = {"0_0": arm} - trial.index = 0 - - metric = JenattonMetric() - df = metric.fetch_trial_data(trial=trial).value.df # pyre-ignore [16] + problem = get_jenatton_benchmark_problem() + arm = Arm(parameters={"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05}, name="0_0") + + experiment = Experiment( + search_space=problem.search_space, + name="Jenatton", + optimization_config=problem.optimization_config, + ) + + trial = Trial(experiment=experiment) + trial.add_arm(arm) + metadata = problem.runner.run(trial=trial) + trial.update_run_metadata(metadata) + + expected_metadata = { + "Ys": {"0_0": [4.25]}, + "Ystds": {"0_0": [0.0]}, + "outcome_names": ["Jenatton"], + "Ys_true": {"0_0": [4.25]}, + } + self.assertEqual(metadata, expected_metadata) + + metric = problem.optimization_config.objective.metric + + df = assert_is_instance(metric.fetch_trial_data(trial=trial).value, Data).df self.assertEqual(len(df), 1) res_dict = df.iloc[0].to_dict() self.assertEqual(res_dict["arm_name"], "0_0") - self.assertEqual(res_dict["metric_name"], "jenatton") + self.assertEqual(res_dict["metric_name"], "Jenatton") self.assertEqual(res_dict["mean"], 4.25) self.assertTrue(math.isnan(res_dict["sem"])) self.assertEqual(res_dict["trial_index"], 0) - metric = JenattonMetric(name="nottanej", noise_std=0.1, observe_noise_sd=True) - df = metric.fetch_trial_data(trial=trial).value.df # pyre-ignore [16] + problem = 
get_jenatton_benchmark_problem(noise_std=0.1, observe_noise_sd=True) + experiment = Experiment( + search_space=problem.search_space, + name="Jenatton", + optimization_config=problem.optimization_config, + ) + + trial = Trial(experiment=experiment) + trial.add_arm(arm) + metadata = problem.runner.run(trial=trial) + trial.update_run_metadata(metadata) + + metric = problem.optimization_config.objective.metric + df = assert_is_instance(metric.fetch_trial_data(trial=trial).value, Data).df self.assertEqual(len(df), 1) res_dict = df.iloc[0].to_dict() self.assertEqual(res_dict["arm_name"], "0_0") - self.assertEqual(res_dict["metric_name"], "nottanej") self.assertNotEqual(res_dict["mean"], 4.25) - self.assertEqual(res_dict["sem"], 0.1) + self.assertAlmostEqual(res_dict["sem"], 0.1) self.assertEqual(res_dict["trial_index"], 0) def test_make_ground_truth_metric(self) -> None: - metric = JenattonMetric() - gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, JenattonMetric) - self.assertEqual(gt_metric.noise_std, 0.0) - self.assertFalse(gt_metric.observe_noise_sd) - metric = JenattonMetric(noise_std=0.1, observe_noise_sd=True) + problem = get_jenatton_benchmark_problem() + + arm = Arm(parameters={"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05}, name="0_0") + + experiment = Experiment( + search_space=problem.search_space, + name="Jenatton", + optimization_config=problem.optimization_config, + ) + + trial = Trial(experiment=experiment) + trial.add_arm(arm) + problem.runner.run(trial=trial) + metadata = problem.runner.run(trial=trial) + trial.update_run_metadata(metadata) + + metric = assert_is_instance( + problem.optimization_config.objective.metric, BenchmarkMetric + ) gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, JenattonMetric) - self.assertEqual(gt_metric.noise_std, 0.0) - self.assertFalse(gt_metric.observe_noise_sd) + self.assertIsInstance(gt_metric, GroundTruthBenchmarkMetric) + runner = assert_is_instance(problem.runner, ParamBasedTestProblemRunner) + self.assertEqual(runner.test_problem.noise_std, 0.0) + self.assertFalse( + assert_is_instance(gt_metric, BenchmarkMetric).observe_noise_sd + ) + + self.assertIsInstance(metric, BenchmarkMetric) + self.assertNotIsInstance(metric, GroundTruthBenchmarkMetric) + self.assertEqual(runner.test_problem.noise_std, 0.0) + self.assertFalse(metric.observe_noise_sd) diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index 3d0ae2eeda3..366c5611122 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -278,16 +278,22 @@ def test_create_benchmark_experiment(self) -> None: def test_replication_sobol_synthetic(self) -> None: method = get_sobol_benchmark_method() - problem = get_single_objective_benchmark_problem() - res = benchmark_replication(problem=problem, method=method, seed=0) + problems = [ + get_single_objective_benchmark_problem(), + get_problem("jenatton", num_trials=6), + ] + for problem in problems: + res = benchmark_replication(problem=problem, method=method, seed=0) - self.assertEqual( - min(problem.num_trials, not_none(method.scheduler_options.total_trials)), - len(not_none(res.experiment).trials), - ) + self.assertEqual( + min( + problem.num_trials, not_none(method.scheduler_options.total_trials) + ), + len(not_none(res.experiment).trials), + ) - self.assertTrue(np.isfinite(res.score_trace).all()) - self.assertTrue(np.all(res.score_trace <= 100)) + self.assertTrue(np.isfinite(res.score_trace).all()) + 
self.assertTrue(np.all(res.score_trace <= 100)) def test_replication_sobol_surrogate(self) -> None: method = get_sobol_benchmark_method() diff --git a/ax/storage/json_store/registry.py b/ax/storage/json_store/registry.py index af8fd720538..12d2ebde923 100644 --- a/ax/storage/json_store/registry.py +++ b/ax/storage/json_store/registry.py @@ -17,7 +17,6 @@ ) from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric -from ax.benchmark.metrics.jenatton import JenattonMetric from ax.benchmark.problems.hpo.pytorch_cnn import PyTorchCNNMetric from ax.benchmark.problems.hpo.torchvision import ( PyTorchCNNTorchvisionBenchmarkProblem, @@ -213,7 +212,6 @@ Hartmann6Metric: metric_to_dict, ImprovementGlobalStoppingStrategy: improvement_global_stopping_strategy_to_dict, Interval: botorch_component_to_dict, - JenattonMetric: metric_to_dict, L2NormMetric: metric_to_dict, LogNormalPrior: botorch_component_to_dict, MapData: map_data_to_dict, @@ -337,7 +335,6 @@ "HierarchicalSearchSpace": HierarchicalSearchSpace, "ImprovementGlobalStoppingStrategy": ImprovementGlobalStoppingStrategy, "Interval": Interval, - "JenattonMetric": JenattonMetric, "LifecycleStage": LifecycleStage, "ListSurrogate": Surrogate, # For backwards compatibility "L2NormMetric": L2NormMetric, diff --git a/ax/storage/json_store/tests/test_json_store.py b/ax/storage/json_store/tests/test_json_store.py index 02e84bc9326..b53139ea3e5 100644 --- a/ax/storage/json_store/tests/test_json_store.py +++ b/ax/storage/json_store/tests/test_json_store.py @@ -13,7 +13,7 @@ import numpy as np import torch -from ax.benchmark.metrics.jenatton import JenattonMetric +from ax.benchmark.problems.synthetic.hss.jenatton import get_jenatton_benchmark_problem from ax.core.metric import Metric from ax.core.objective import Objective from ax.core.runner import Runner @@ -192,7 +192,6 @@ ("HierarchicalSearchSpace", get_hierarchical_search_space), ("ImprovementGlobalStoppingStrategy", get_improvement_global_stopping_strategy), ("Interval", get_interval), - ("JenattonMetric", JenattonMetric), ("MapData", get_map_data), ("MapData", get_map_data), ("MapKeyInfo", get_map_key_info), @@ -209,6 +208,7 @@ ("OrderConstraint", get_order_constraint), ("OutcomeConstraint", get_outcome_constraint), ("Path", get_pathlib_path), + ("Jenatton", get_jenatton_benchmark_problem), ("PercentileEarlyStoppingStrategy", get_percentile_early_stopping_strategy), ( "PercentileEarlyStoppingStrategy",