Remove tracking metrics setup for noisy benchmarks (#2706)
Summary:
Pull Request resolved: #2706

No longer needed after the changes in the previous diff.

Reviewed By: Balandat

Differential Revision: D61415525
esantorella authored and facebook-github-bot committed Aug 25, 2024
1 parent 21eed91 commit 881fad5
Showing 17 changed files with 27 additions and 585 deletions.
172 changes: 7 additions & 165 deletions ax/benchmark/benchmark.py
@@ -29,20 +29,11 @@
from ax.benchmark.benchmark_method import BenchmarkMethod
from ax.benchmark.benchmark_problem import BenchmarkProblem
from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin
from ax.core.experiment import Experiment
from ax.core.metric import Metric
from ax.core.objective import MultiObjective, Objective
from ax.core.optimization_config import (
MultiObjectiveOptimizationConfig,
OptimizationConfig,
)
from ax.core.outcome_constraint import ObjectiveThreshold, OutcomeConstraint
from ax.core.utils import get_model_times
from ax.service.scheduler import Scheduler
from ax.utils.common.logger import get_logger
from ax.utils.common.random import with_rng_seed
from ax.utils.common.typeutils import checked_cast, not_none

logger: Logger = get_logger(__name__)

@@ -88,25 +79,10 @@ def _create_benchmark_experiment(
Returns:
The Experiment object to be used for benchmarking.
"""
tracking_metrics = problem.tracking_metrics
if not problem.is_noiseless and problem.has_ground_truth:
# Make the ground truth counterparts for each metric defined on the problem,
# which will be added as tracking metrics on the Experiment object below.
# In the analysis, a modified OptimizationConfig referencing those metrics
# will be passed to the `Scheduler.get_trace()` method, which makes it
# possible to extract the optimization trace based on the ground truth
# outcomes (without noise). If the problem is known to be noiseless, this
# is unnecessary and we can just
# use the observations made during the optimization loop directly.
gt_metric_dict = make_ground_truth_metrics(problem=problem)
tracking_metrics = tracking_metrics + list(gt_metric_dict.values())
return Experiment(
name=f"{problem.name}|{method_name}_{int(time())}",
search_space=problem.search_space,
optimization_config=problem.optimization_config,
tracking_metrics=tracking_metrics, # pyre-ignore [6]: Incompatible
# parameter type: In call `Experiment.__init__`, for argument
# `tracking_metrics`, expected `Optional[List[Metric]]` but got
# `Union[List[Union[BenchmarkMetricBase, Metric]], List[BenchmarkMetricBase]]`.
runner=problem.runner,
)

@@ -124,7 +100,12 @@ def benchmark_replication(
seed: The seed to use for this replication.
"""

experiment = _create_benchmark_experiment(problem=problem, method_name=method.name)
experiment = Experiment(
name=f"{problem.name}|{method.name}_{int(time())}",
search_space=problem.search_space,
optimization_config=problem.optimization_config,
runner=problem.runner,
)

scheduler = Scheduler(
experiment=experiment,
@@ -135,24 +116,7 @@
with with_rng_seed(seed=seed):
scheduler.run_n_trials(max_trials=problem.num_trials)

if not problem.is_noiseless and problem.has_ground_truth:
# We modify the optimization config so that `Scheduler.get_trace()` uses
# the true (not corrupted by noise) observations that were logged
# as tracking metrics on the Experiment object. If the problem is known to
# be noiseless, this is unnecessary and we can just use the observations
# made during the optimization loop directly.
analysis_opt_config = make_ground_truth_optimization_config(
experiment=experiment
)
else:
analysis_opt_config = experiment.optimization_config

optimization_trace = np.asarray(
scheduler.get_trace(optimization_config=analysis_opt_config)
)

new_optimization_trace = problem.get_opt_trace(experiment=experiment)
np.testing.assert_allclose(optimization_trace, new_optimization_trace)
optimization_trace = problem.get_opt_trace(experiment=experiment)

try:
# Catch any errors that may occur during score computation, such as errors
@@ -217,125 +181,3 @@ def benchmark_multiple_problems_methods(
benchmark_one_method_problem(problem=p, method=m, seeds=seeds)
for p, m in product(problems, methods)
]


def make_ground_truth_metrics(
problem: BenchmarkProblem,
include_tracking_metrics: bool = True,
) -> dict[str, Metric]:
"""Makes a ground truth version for each metric defined on the problem.
Args:
problem: The BenchmarkProblem to test against (can be synthetic or real).
include_tracking_metrics: Whether or not to include tracking metrics.
Returns:
A dict mapping (original) metric names to their respective ground truth metric.
"""
if not problem.has_ground_truth:
raise ValueError(
"Cannot create ground truth metrics for problems that "
"do not have a ground truth."
)
metrics: list[BenchmarkMetricBase] = [
checked_cast(BenchmarkMetricBase, metric)
for metric in problem.optimization_config.metrics.values()
]
if include_tracking_metrics:
metrics = metrics + problem.tracking_metrics
return {metric.name: metric.make_ground_truth_metric() for metric in metrics}


def make_ground_truth_optimization_config(
experiment: Experiment,
) -> OptimizationConfig:
"""Makes a clone of the OptimizationConfig on the experiment in which each metric
is replaced by its respective "ground truth" counterpart, which has been added to
the experiment's tracking metrics in `_create_benchmark_experiment` and which
returns the ground truth (i.e., uncorrupted by noise) observations.
"""
optimization_config = not_none(experiment.optimization_config)

if optimization_config.risk_measure is not None:
raise NotImplementedError("Support for risk measures is not yet implemented.")

# dict for caching metric lookup
gt_metric_dict: dict[str, BenchmarkMetricBase] = {}

def get_gt_metric(metric: Metric) -> BenchmarkMetricBase:
"""Look up corresponding ground truth metric of the experiment. Will error
out if no corresponding ground truth metric exists."""
if not isinstance(metric, BenchmarkMetricBase):
raise ValueError(
"Only BenchmarkMetricBase metrics are supported for ground truth "
f"metrics. Got {type(metric)}."
)

if metric.name in gt_metric_dict:
return gt_metric_dict[metric.name]

for tracking_metric in experiment.tracking_metrics:
if getattr(tracking_metric, "is_ground_truth", False):
# TODO: Figure out if there is a better way to match the ground truth
# metric and the original metric.
ground_truth_name = tracking_metric.name
orig_name = checked_cast(
GroundTruthMetricMixin, tracking_metric
).get_original_name(ground_truth_name)
if orig_name == metric.name:
tracking_metric = checked_cast(BenchmarkMetricBase, tracking_metric)
gt_metric_dict[metric.name] = tracking_metric
return tracking_metric
raise ValueError(f"Ground truth metric for metric {metric.name} not found!")

# convert outcome constraints
if optimization_config.outcome_constraints is not None:
gt_outcome_constraints = [
OutcomeConstraint(
metric=get_gt_metric(oc.metric),
op=oc.op,
bound=oc.bound,
relative=oc.relative,
)
for oc in optimization_config.outcome_constraints
]
else:
gt_outcome_constraints = None

# we need to distinguish MOO and non-MOO problems
if not optimization_config.is_moo_problem:
gt_objective = Objective(
metric=get_gt_metric(optimization_config.objective.metric)
)

return OptimizationConfig(
objective=gt_objective, outcome_constraints=gt_outcome_constraints
)

gt_objective = MultiObjective(
metrics=[
get_gt_metric(metric) for metric in optimization_config.objective.metrics
]
)
# there may be objective thresholds to also convert
objective_thresholds = checked_cast(
MultiObjectiveOptimizationConfig, optimization_config
).objective_thresholds
if objective_thresholds is not None:
gt_objective_thresholds = [
ObjectiveThreshold(
metric=get_gt_metric(ot.metric),
bound=ot.bound,
relative=ot.relative,
op=ot.op,
)
for ot in objective_thresholds
]
else:
gt_objective_thresholds = None

return MultiObjectiveOptimizationConfig(
objective=gt_objective,
outcome_constraints=gt_outcome_constraints,
objective_thresholds=gt_objective_thresholds,
)
50 changes: 1 addition & 49 deletions ax/benchmark/benchmark_method.py
@@ -7,13 +7,11 @@

import logging
from dataclasses import dataclass
from typing import Any

from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy
from ax.modelbridge.generation_strategy import GenerationStrategy
from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
from ax.utils.common.base import Base
from ax.utils.common.logger import get_logger
from ax.utils.common.typeutils import not_none


logger: logging.Logger = get_logger("BenchmarkMethod")
@@ -28,52 +26,13 @@ class BenchmarkMethod(Base):
Note: If `BenchmarkMethod.scheduler_options.total_trials` is less than
`BenchmarkProblem.num_trials`, then only the number of trials specified in the
former will be run.
Note: The `generation_strategy` passed in is assumed to be in its "base state",
as it will be cloned and reset.
"""

name: str
generation_strategy: GenerationStrategy
scheduler_options: SchedulerOptions
distribute_replications: bool = False

def __post_init__(self) -> None:
# We generally don't want to fit tracking metrics during our
# benchmarks. Further, not setting `fit_tracking_metrics=False` causes
# issues with the ground truth metrics created automatically when running
# the benchmark - in fact, things will error out deep inside the modeling
# stack since the model gets both noisy (benchmark) and noiseless (ground
# truth) observations. While support for this is something we should add
# for models, in the context of benchmarking we actually want to avoid
# fitting the ground truth metrics at all.

# Clone the GS so as to not modify the original one in-place below.
# Note that this assumes that the GS passed in is in its base state.
gs_cloned = self.generation_strategy.clone_reset()

for node in gs_cloned._nodes:
if isinstance(node, GenerationStep):
if node.model_kwargs is None:
node.model_kwargs = {}
if node.model_kwargs.get("fit_tracking_metrics", True):
logger.info(
"Setting `fit_tracking_metrics` in a GenerationStep to False.",
)
not_none(node.model_kwargs)["fit_tracking_metrics"] = False
for model_spec in node.model_specs:
if model_spec.model_kwargs is None:
model_spec.model_kwargs = {}
elif model_spec.model_kwargs.get("fit_tracking_metrics", True):
logger.info(
"Setting `fit_tracking_metrics` in a GenerationNode's "
"model_spec to False."
)
not_none(model_spec.model_kwargs)["fit_tracking_metrics"] = False

# hack around not being able to update frozen attribute of a dataclass
_assign_frozen_attr(self, name="generation_strategy", value=gs_cloned)


def get_benchmark_scheduler_options(
timeout_hours: int = 4,
@@ -103,10 +62,3 @@ def get_benchmark_scheduler_options(
trial_type=TrialType.TRIAL if batch_size == 1 else TrialType.BATCH_TRIAL,
batch_size=batch_size,
)


def _assign_frozen_attr(obj: Any, name: str, value: Any) -> None: # pyre-ignore [2]
"""Assign a new value to an attribute of a frozen dataclass.
This is an ugly hack and shouldn't be used broadly.
"""
object.__setattr__(obj, name, value)
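The deleted _assign_frozen_attr helper wrapped the standard escape hatch for frozen dataclasses: object.__setattr__ bypasses the generated __setattr__ that raises FrozenInstanceError. A minimal standalone sketch of that pattern, using a made-up Config class rather than BenchmarkMethod:

# Minimal sketch of the frozen-dataclass workaround the removed
# `_assign_frozen_attr` helper used. `Config` is a made-up example class.
from dataclasses import FrozenInstanceError, dataclass


@dataclass(frozen=True)
class Config:
    name: str


cfg = Config(name="original")

try:
    cfg.name = "updated"  # the generated __setattr__ refuses this
except FrozenInstanceError:
    pass

# Bypass the frozen check by going straight to the base-class setter.
# This is the same hack the deleted helper wrapped; use it sparingly.
object.__setattr__(cfg, "name", "updated")
assert cfg.name == "updated"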
15 changes: 0 additions & 15 deletions ax/benchmark/benchmark_problem.py
@@ -11,8 +11,6 @@
import numpy as np
import pandas as pd

from ax.benchmark.metrics.base import BenchmarkMetricBase

from ax.benchmark.metrics.benchmark import BenchmarkMetric
from ax.benchmark.runners.base import BenchmarkRunner
from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner
@@ -72,12 +70,6 @@ class BenchmarkProblem(Base):
observe_noise_stds: If boolean, whether the standard deviation of the
observation noise is observed for all metrics. If a dictionary,
whether noise levels are observed on a per-metric basis.
has_ground_truth: Whether the Runner produces underlying ground truth
values, which are not observed in real noisy problems but may be
known in benchmarks.
tracking_metrics: Tracking metrics are not optimized, and for the
purpose of benchmarking, they will not be fit. The ground truth may
be provided as `tracking_metrics`.
optimal_value: The best ground-truth objective value. Hypervolume for
multi-objective problems. If the best value is not known, it is
conventional to set it to a value that is almost certainly better
@@ -91,13 +83,10 @@
optimization_config: OptimizationConfig
num_trials: int
observe_noise_stds: Union[bool, dict[str, bool]] = False
has_ground_truth: bool = True
tracking_metrics: list[BenchmarkMetricBase] = field(default_factory=list)
optimal_value: float

search_space: SearchSpace = field(repr=False)
runner: BenchmarkRunner = field(repr=False)
is_noiseless: bool

def get_oracle_experiment(self, experiment: Experiment) -> Experiment:
records = []
@@ -263,8 +252,6 @@ def create_single_objective_problem_from_botorch(
),
num_trials=num_trials,
observe_noise_stds=observe_noise_sd,
is_noiseless=test_problem.noise_std in (None, 0.0),
has_ground_truth=True, # all synthetic problems have ground truth
optimal_value=optimal_value,
)

@@ -356,8 +343,6 @@ def create_multi_objective_problem_from_botorch(
optimization_config=optimization_config,
runner=runner,
num_trials=num_trials,
is_noiseless=test_problem.noise_std in (None, 0.0),
observe_noise_stds=observe_noise_sd,
has_ground_truth=True,
optimal_value=test_problem.max_hv,
)
