From a753d3e73edde726e7eda895cbcbb34d7bb6a39f Mon Sep 17 00:00:00 2001 From: Matthew Grange Date: Tue, 21 Nov 2023 07:26:32 -0800 Subject: [PATCH] Create output message, comparing baseline to optimal result. Single Objective only (1/n) (#1997) Summary: For single objective cases, if both a baseline trial and an optimal trial are present in trials_df, add a message to self.markdown_messages with the % improvement between baseline to optimal. Next step: add support for MOO cases Reviewed By: bernardbeckerman Differential Revision: D51073855 --- ax/service/tests/test_report_utils.py | 446 +++++++++++++++++++++++++- ax/service/utils/report_utils.py | 190 +++++++++++ 2 files changed, 635 insertions(+), 1 deletion(-) diff --git a/ax/service/tests/test_report_utils.py b/ax/service/tests/test_report_utils.py index dd290be0b6a..6577d24add2 100644 --- a/ax/service/tests/test_report_utils.py +++ b/ax/service/tests/test_report_utils.py @@ -15,11 +15,15 @@ from ax.core.arm import Arm from ax.core.metric import Metric from ax.core.objective import MultiObjective, Objective -from ax.core.optimization_config import MultiObjectiveOptimizationConfig +from ax.core.optimization_config import ( + MultiObjectiveOptimizationConfig, + OptimizationConfig, +) from ax.core.outcome_constraint import ObjectiveThreshold from ax.core.types import ComparisonOp from ax.modelbridge.registry import Models from ax.service.utils.report_utils import ( + _format_comparison_string, _get_cross_validation_plots, _get_curve_plot_dropdown, _get_metric_name_pairs, @@ -27,6 +31,8 @@ _get_objective_v_param_plots, _get_shortest_unique_suffix_dict, _objective_vs_true_objective_scatter, + BASELINE_ARM_NAME, + compare_to_baseline, compute_maximum_map_values, exp_to_df, Experiment, @@ -40,6 +46,7 @@ get_branin_experiment, get_branin_experiment_with_multi_objective, get_branin_experiment_with_timestamp_map_metric, + get_branin_search_space, get_experiment_with_observations, get_high_dimensional_branin_experiment, get_multi_type_experiment, @@ -517,3 +524,440 @@ def test_get_metric_name_pairs(self) -> None: list(metric_name_pairs), list(itertools.combinations([f"m{i}" for i in range(4)], 2)), ) + + def test_compare_to_baseline(self) -> None: + """Test that compare to baseline parses arm df properly, + obtains the objective metric values based + on the provided OptimizationConfig, and + produces the intended text + """ + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.2}, + {"trial_index": 1, "arm_name": "dummy", OBJECTIVE_METRIC: 0.5}, + {"trial_index": 2, "arm_name": "optimal", OBJECTIVE_METRIC: 2.5}, + {"trial_index": 3, "arm_name": "bad_optimal", OBJECTIVE_METRIC: 0.05}, + ] + arms_df = pd.DataFrame(data) + + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=False) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + ) + + optimization_config = OptimizationConfig( + objective=Objective(metric=true_obj_metric, minimize=False), + outcome_constraints=[], + ) + experiment.optimization_config = optimization_config + + comparison_arm_names = ["optimal"] + + result = compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ) + + output_text = _format_comparison_string( + comparison_arm_names[0], OBJECTIVE_METRIC, 1150.0, 0.2, 2.5 + ) + + 
self.assertNotEqual(result, None) + self.assertEqual(result, output_text) + + bad_comparison_arm_names = ["bad_optimal"] + # because result increased from baseline, no improvement result returned + bad_result = compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=bad_comparison_arm_names, + ) + self.assertEqual(bad_result, None) + + def test_compare_to_baseline_pass_in_opt(self) -> None: + """Test that compare to baseline parses arm df properly, + obtains the objective metric values based + on the provided OptimizationConfig, and + produces the intended text + """ + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.2}, + {"trial_index": 1, "arm_name": "dummy", OBJECTIVE_METRIC: 0.5}, + {"trial_index": 2, "arm_name": "optimal", OBJECTIVE_METRIC: 2.5}, + {"trial_index": 3, "arm_name": "bad_optimal", OBJECTIVE_METRIC: 0.05}, + ] + arms_df = pd.DataFrame(data) + + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=False) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + optimization_config=None, + ) + + optimization_config = OptimizationConfig( + objective=Objective(metric=true_obj_metric, minimize=False), + outcome_constraints=[], + ) + experiment.optimization_config = optimization_config + + comparison_arm_names = ["optimal"] + + result = compare_to_baseline( + experiment=experiment, + optimization_config=optimization_config, + comparison_arm_names=comparison_arm_names, + ) + + output_text = _format_comparison_string( + comparison_arm_names[0], OBJECTIVE_METRIC, 1150.0, 0.2, 2.5 + ) + + self.assertNotEqual(result, None) + self.assertEqual(result, output_text) + + def test_compare_to_baseline_minimize(self) -> None: + """Test that compare to baseline parses arm df properly, + obtains the objective metric values based + on the provided OptimizationConfig, and + produces the intended text. 
+ For the minimize case + """ + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.2}, + {"trial_index": 1, "arm_name": "dummy", OBJECTIVE_METRIC: 0.5}, + {"trial_index": 2, "arm_name": "optimal", OBJECTIVE_METRIC: 0.1}, + {"trial_index": 3, "arm_name": "bad_optimal", OBJECTIVE_METRIC: 1.0}, + ] + arms_df = pd.DataFrame(data) + + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=True) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + ) + + optimization_config = OptimizationConfig( + objective=Objective(metric=true_obj_metric, minimize=True), + outcome_constraints=[], + ) + experiment.optimization_config = optimization_config + + comparison_arm_names = ["optimal"] + + result = compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ) + + output_text = _format_comparison_string( + comparison_arm_names[0], OBJECTIVE_METRIC, 50.0, 0.2, 0.1 + ) + + self.assertNotEqual(result, None) + self.assertEqual(result, output_text) + + bad_comparison_arm_names = ["bad_optimal"] + # because result increased from baseline, no improvement result returned + bad_result = compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=bad_comparison_arm_names, + ) + self.assertEqual(bad_result, None) + + def test_compare_to_baseline_edge_case(self) -> None: + """Test that compare to baseline parses arm df properly, + obtains the objective metric values based + on the provided OptimizationConfig, and + produces the intended text + """ + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=True) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + ) + + optimization_config = OptimizationConfig( + objective=Objective(metric=true_obj_metric, minimize=True), + outcome_constraints=[], + ) + experiment.optimization_config = optimization_config + comparison_arm_names = ["optimal"] + + # baseline value is 0 + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.0}, + {"trial_index": 1, "arm_name": "optimal", OBJECTIVE_METRIC: 1.0}, + ] + arms_df = pd.DataFrame(data) + + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + with self.assertLogs("ax", level=INFO) as log: + self.assertEqual( + compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ), + None, + ) + self.assertTrue( + any( + ( + "compare_to_baseline: baseline has value of 0" + + ", can't compute percent change." + ) + in log_str + for log_str in log.output + ), + log.output, + ) + + # no best arm names + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + with self.assertLogs("ax", level=INFO) as log: + self.assertEqual( + compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=None, + ), + None, + ) + self.assertTrue( + any( + ( + "compare_to_baseline: comparison_arm_names not provided." + + " Returning None." 
+ ) + in log_str + for log_str in log.output + ), + log.output, + ) + + # no optimization config + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + with self.assertLogs("ax", level=INFO) as log: + exp_no_opt = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + optimization_config=None, + ) + self.assertEqual( + compare_to_baseline( + experiment=exp_no_opt, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ), + None, + ) + self.assertEqual(exp_no_opt.optimization_config, None) + self.assertTrue( + any( + ( + "compare_to_baseline: optimization_config neither provided in inputs nor present on experiment." + ) + in log_str + for log_str in log.output + ), + log.output, + ) + + def test_compare_to_baseline_arms_not_found(self) -> None: + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=True) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + ) + + optimization_config = OptimizationConfig( + objective=Objective(metric=true_obj_metric, minimize=True), + outcome_constraints=[], + ) + experiment.optimization_config = optimization_config + comparison_arm_names = ["optimal"] + + # baseline value is 0 + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.0}, + {"trial_index": 1, "arm_name": "optimal", OBJECTIVE_METRIC: 1.0}, + ] + arms_df = pd.DataFrame(data) + + # no arms df + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=None, + ): + with self.assertLogs("ax", level=INFO) as log: + self.assertEqual( + compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ), + None, + ) + self.assertTrue( + any( + ("compare_to_baseline: arms_df is None.") in log_str + for log_str in log.output + ), + log.output, + ) + + # best arms df is none + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + with self.assertLogs("ax", level=INFO) as log: + comparison_arm_not_found = ["unknown_arm"] + self.assertEqual( + compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_not_found, + ), + None, + ) + self.assertTrue( + any( + ("compare_to_baseline: comparison_arm_df has no rows.") + in log_str + for log_str in log.output + ), + log.output, + ) + + # baseline not found in arms_df + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + baseline_arm_name = "not_baseline_arm_in_dataframe" + experiment_with_status_quo = experiment + experiment_with_status_quo.status_quo = Arm( + name=baseline_arm_name, + parameters={"x1": 0, "x2": 0}, + ) + with self.assertLogs("ax", level=INFO) as log: + self.assertEqual( + compare_to_baseline( + experiment=experiment_with_status_quo, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ), + None, + ) + self.assertTrue( + any( + ( + f"compare_to_baseline: baseline row: {baseline_arm_name=}" + + " not found in arms" + ) + in log_str + for log_str in log.output + ), + log.output, + ) + + def test_compare_to_baseline_moo(self) -> None: + """Test that compare to baseline errors out correctly + for multi objective problems + + """ + self.maxDiff = None + OBJECTIVE_METRIC = "foo" + + data = [ + {"trial_index": 0, "arm_name": BASELINE_ARM_NAME, OBJECTIVE_METRIC: 0.2}, + {"trial_index": 1, "arm_name": "dummy", 
OBJECTIVE_METRIC: 0.5}, + {"trial_index": 2, "arm_name": "optimal", OBJECTIVE_METRIC: 0.1}, + {"trial_index": 3, "arm_name": "bad_optimal", OBJECTIVE_METRIC: 1.0}, + ] + arms_df = pd.DataFrame(data) + + with patch( + "ax.service.utils.report_utils.exp_to_df", + return_value=arms_df, + ): + true_obj_metric = Metric(name=OBJECTIVE_METRIC, lower_is_better=True) + experiment = Experiment( + search_space=get_branin_search_space(), + tracking_metrics=[true_obj_metric], + ) + + optimization_config = MultiObjectiveOptimizationConfig( + objective=MultiObjective( + objectives=[ + Objective(metric=Metric("m0")), + Objective(metric=Metric("m1")), + Objective(metric=Metric("m2")), + Objective(metric=Metric("m3")), + Objective(metric=Metric("m4")), + ] + ) + ) + experiment.optimization_config = optimization_config + self.assertEqual(True, experiment.is_moo_problem) + + comparison_arm_names = ["optimal"] + + with self.assertLogs("ax", level=INFO) as log: + self.assertEqual( + compare_to_baseline( + experiment=experiment, + optimization_config=None, + comparison_arm_names=comparison_arm_names, + ), + None, + ) + self.assertTrue( + any( + "compare_to_baseline: not yet implemented for moo problems" + in log_str + for log_str in log.output + ), + log.output, + ) diff --git a/ax/service/utils/report_utils.py b/ax/service/utils/report_utils.py index 0431349bf1d..312b834ebb9 100644 --- a/ax/service/utils/report_utils.py +++ b/ax/service/utils/report_utils.py @@ -35,6 +35,7 @@ from ax.core.metric import Metric from ax.core.multi_type_experiment import MultiTypeExperiment from ax.core.objective import MultiObjective, ScalarizedObjective +from ax.core.optimization_config import OptimizationConfig from ax.core.trial import BaseTrial from ax.early_stopping.strategies.base import BaseEarlyStoppingStrategy from ax.exceptions.core import DataRequiredError, UserInputError @@ -81,6 +82,7 @@ "This may hide outliers. You can autoscale the axes to see all trials." ) FEASIBLE_COL_NAME = "is_feasible" +BASELINE_ARM_NAME = "baseline_arm" def _get_cross_validation_plots(model: ModelBridge) -> List[go.Figure]: @@ -1116,3 +1118,191 @@ def _warn_and_create_warning_plot(warning_msg: str) -> go.Figure: .update_xaxes(showgrid=False, showticklabels=False, zeroline=False) .update_yaxes(showgrid=False, showticklabels=False, zeroline=False) ) + + +def _format_comparison_string( + comparison_arm_name: str, + objective_name: str, + percent_change: float, + baseline_value: float, + comparison_value: float, +) -> str: + baseline_arm_name = BASELINE_ARM_NAME + return ( + f"{comparison_arm_name=} " + + "improves your objective metric " + + f"{objective_name} by {percent_change}%. " + + f" {baseline_arm_name=} was improved " + + f"from {baseline_value=}" + + f" to {comparison_value=}" + ) + + +def _construct_comparison_message( + objective_name: str, + objective_minimize: bool, + baseline_arm_name: str, + baseline_value: float, + comparison_arm_name: str, + comparison_value: float, + digits: int = 2, +) -> Optional[str]: + # TODO: allow for user configured digits value + if baseline_value == 0: + logger.info( + "compare_to_baseline: baseline has value of 0" + + ", can't compute percent change." + ) + return None + + if (objective_minimize and (baseline_value < comparison_value)) or ( + not objective_minimize and (baseline_value > comparison_value) + ): + logger.info( + f"compare_to_baseline: comparison arm {comparison_arm_name}" + + f" did not beat baseline arm {baseline_arm_name}. 
" + ) + return None + percent_change = round( + ((abs(comparison_value - baseline_value)) / baseline_value) * 100, digits + ) + + return _format_comparison_string( + comparison_arm_name, + objective_name, + percent_change, + baseline_value, + comparison_value, + ) + + +def _build_result_tuple( + objective_name: str, + objective_minimize: bool, + baseline_arm_name: str, + baseline_value: float, + comparison_row: pd.DataFrame, +) -> Tuple[str, bool, str, float, str, float]: + comparison_arm_name = checked_cast(str, comparison_row["arm_name"]) + comparison_value = checked_cast(float, comparison_row[objective_name]) + + result = ( + objective_name, + objective_minimize, + baseline_arm_name, + baseline_value, + comparison_arm_name, + comparison_value, + ) + return result + + +def maybe_extract_baseline_comparison_values( + experiment: Experiment, + optimization_config: Optional[OptimizationConfig], + comparison_arm_names: Optional[List[str]], +) -> Optional[List[Tuple[str, bool, str, float, str, float]]]: + """ + Extracts the baseline values from the experiment, for use in + comparing the baseline arm to the optimal results. + Requires the user specifies the names of the arms to compare to. + + Returns: + List of tuples containing: + (metric_name, + minimize, + comparison_arm_name, + baseline_arm_name, + baseline_value, + comparison_arm_value) + """ + # TODO: incorporate model uncertainty when available + # TODO: extract and use best arms if comparison_arm_names is not provided. + # Can do this automatically using optimization_config. + if not comparison_arm_names: + logger.info( + "compare_to_baseline: comparison_arm_names not provided. Returning None." + ) + return None + if not optimization_config: + if experiment.optimization_config is None: + logger.info( + "compare_to_baseline: optimization_config neither" + + " provided in inputs nor present on experiment." + ) + return None + optimization_config = experiment.optimization_config + + arms_df = exp_to_df(experiment) + if arms_df is None: + logger.info("compare_to_baseline: arms_df is None.") + return None + + if experiment.is_moo_problem: + logger.info("compare_to_baseline: not yet implemented for moo problems") + return None + # TODO: compare_to_baseline for multi-objective optimization + + comparison_arm_df = arms_df[arms_df["arm_name"] == comparison_arm_names[0]] + + if comparison_arm_df is None or len(comparison_arm_df) == 0: + logger.info("compare_to_baseline: comparison_arm_df has no rows.") + return None + + baseline_arm_name = ( + BASELINE_ARM_NAME + if not experiment.status_quo + else not_none(experiment.status_quo).name + ) + + baseline_rows = arms_df[arms_df["arm_name"] == baseline_arm_name] + if len(baseline_rows) == 0: + logger.info( + f"compare_to_baseline: baseline row: {baseline_arm_name=} not found in arms" + ) + return None + + objective_name = optimization_config.objective.metric.name + baseline_value = baseline_rows.iloc[0][objective_name] + comparison_row = comparison_arm_df.iloc[0] + + return [ + _build_result_tuple( + objective_name=objective_name, + objective_minimize=optimization_config.objective.minimize, + baseline_arm_name=baseline_arm_name, + baseline_value=baseline_value, + comparison_row=comparison_row, + ) + ] + + +def compare_to_baseline( + experiment: Experiment, + optimization_config: Optional[OptimizationConfig], + comparison_arm_names: Optional[List[str]], +) -> Optional[str]: + """Calculate metric improvement of the experiment against baseline. 
+    Returns the message(s) added to markdown_messages"""
+    # TODO: add baseline_arm_name as a parameter
+
+    comparison_list = maybe_extract_baseline_comparison_values(
+        experiment=experiment,
+        optimization_config=optimization_config,
+        comparison_arm_names=comparison_arm_names,
+    )
+    if not comparison_list:
+        return None
+    comparison_list = not_none(comparison_list)
+    result_message = ""
+
+    for idx, result_tuple in enumerate(comparison_list):
+        comparison_message = _construct_comparison_message(*result_tuple)
+        if comparison_message:
+            result_message = (
+                result_message
+                + not_none(comparison_message)
+                + ("\n" if idx != len(comparison_list) - 1 else "")
+            )
+
+    return result_message if result_message else None