From 9d93085f746684e3523e45e477623a978dae21a2 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 22 Nov 2021 18:56:41 +0100 Subject: [PATCH 1/7] change worst possible result of r2 (#340) --- autoPyTorch/pipeline/components/training/metrics/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index b669e4ede..0d82b9622 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -36,7 +36,8 @@ worst_possible_result=MAXINT, greater_is_better=False) r2 = make_metric('r2', - sklearn.metrics.r2_score) + sklearn.metrics.r2_score, + worst_possible_result=-MAXINT) # Standard Classification Scores accuracy = make_metric('accuracy', From f7e9cf8d98cfb97be37629650624f082b3317b51 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 22 Nov 2021 19:12:55 +0100 Subject: [PATCH 2/7] Update README.md with link for master branch --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1818caba..389ab7902 100755 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ While early AutoML frameworks focused on optimizing traditional ML pipelines and Auto-PyTorch is mainly developed to support tabular data (classification, regression). The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). -Also, find the documentation [here](https://automl.github.io/Auto-PyTorch/development). +Also, find the documentation [here](https://automl.github.io/Auto-PyTorch/master). ***From v0.1.0, AutoPyTorch has been updated to further improve usability, robustness and efficiency by using SMAC as the underlying optimization package as well as changing the code structure. Therefore, moving from v0.0.2 to v0.1.0 will break compatibility. In case you would like to use the old API, you can find it at [`master_old`](https://github.com/automl/Auto-PyTorch/tree/master-old).*** From 5b65244a3da9047112b9634546587f2cbdcdf463 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 22 Nov 2021 22:55:01 +0100 Subject: [PATCH 3/7] [FIX formatting in docs (#342) * fix formatting in docs * Update examples/40_advanced/example_resampling_strategy.py --- .../example_custom_configuration_space.py | 153 +++++++++--------- .../40_advanced/example_parallel_n_jobs.py | 11 +- .../example_resampling_strategy.py | 30 ++-- .../40_advanced/example_run_with_portfolio.py | 84 +++++----- 4 files changed, 144 insertions(+), 134 deletions(-) diff --git a/examples/40_advanced/example_custom_configuration_space.py b/examples/40_advanced/example_custom_configuration_space.py index c64a4fca1..985d9d9ff 100644 --- a/examples/40_advanced/example_custom_configuration_space.py +++ b/examples/40_advanced/example_custom_configuration_space.py @@ -5,6 +5,7 @@ The following example shows how adjust the configuration space of the search. Currently, there are two changes that can be made to the space:- + 1. Adjust individual hyperparameters in the pipeline 2. Include or exclude components: a) include: Dictionary containing components to include. 
Key is the node @@ -57,80 +58,78 @@ def get_search_space_updates(): return updates -if __name__ == '__main__': - - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - ) - - ############################################################################ - # Build and fit a classifier with include components - # ================================================== - api = TabularClassificationTask( - search_space_updates=get_search_space_updates(), - include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], - 'encoder': ['OneHotEncoder']} - ) - - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train.copy(), - y_train=y_train.copy(), - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit_secs=30 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - print(api.show_models()) - - # Print statistics from search - print(api.sprint_statistics()) - - ############################################################################ - # Build and fit a classifier with exclude components - # ================================================== - api = TabularClassificationTask( - search_space_updates=get_search_space_updates(), - exclude_components={'network_backbone': ['MLPBackbone'], - 'encoder': ['OneHotEncoder']} - ) - - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit_secs=30 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - print(api.show_models()) - - # Print statistics from search - print(api.sprint_statistics()) +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, +) + +############################################################################ +# Build and fit a classifier with include components +# ================================================== +api = TabularClassificationTask( + search_space_updates=get_search_space_updates(), + include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], + 'encoder': ['OneHotEncoder']} +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( 
+ X_train=X_train.copy(), + y_train=y_train.copy(), + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit_secs=30 +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +print(api.show_models()) + +# Print statistics from search +print(api.sprint_statistics()) + +############################################################################ +# Build and fit a classifier with exclude components +# ================================================== +api = TabularClassificationTask( + search_space_updates=get_search_space_updates(), + exclude_components={'network_backbone': ['MLPBackbone'], + 'encoder': ['OneHotEncoder']} +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit_secs=30 +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +print(api.show_models()) + +# Print statistics from search +print(api.sprint_statistics()) diff --git a/examples/40_advanced/example_parallel_n_jobs.py b/examples/40_advanced/example_parallel_n_jobs.py index 698f3ad61..d345c6fca 100644 --- a/examples/40_advanced/example_parallel_n_jobs.py +++ b/examples/40_advanced/example_parallel_n_jobs.py @@ -1,10 +1,11 @@ """ -====================== -Tabular Classification -====================== +============================================ +Tabular Classification with n parallel jobs +============================================ The following example shows how to fit a sample classification model parallely on 2 cores with AutoPyTorch + """ import os import tempfile as tmp @@ -60,9 +61,9 @@ ############################################################################ # Print the final ensemble performance # ==================================== - print(api.run_history, api.trajectory) y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) # Print the final ensemble built by AutoPyTorch - print(api.show_models()) + print(api.sprint_statistics()) + diff --git a/examples/40_advanced/example_resampling_strategy.py b/examples/40_advanced/example_resampling_strategy.py index 6735fffee..d02859f1b 100644 --- a/examples/40_advanced/example_resampling_strategy.py +++ b/examples/40_advanced/example_resampling_strategy.py @@ -26,10 +26,13 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +############################################################################ +# Default Resampling Strategy +# ============================ ############################################################################ # Data Loading -# ============ +# ------------ X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, @@ -39,7 +42,7 @@ 
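# --- Editor's aside: illustrative sketch only, not part of this diff. ---
# The example below relies on the default holdout split. Per its own in-line
# comment, the equivalent explicit form would look roughly like this (argument
# names mirror the CrossValTypes usage later in this file; verify them against
# the installed AutoPyTorch version):
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes

api_explicit = TabularClassificationTask(
    resampling_strategy=HoldoutValTypes.holdout_validation,
    resampling_strategy_args={'val_share': 0.33},  # documented default split
)
# --- End of editor's aside. ---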
############################################################################ # Build and fit a classifier with default resampling strategy -# =========================================================== +# ----------------------------------------------------------- api = TabularClassificationTask( # 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33 # is the default argument setting for TabularClassificationTask. @@ -51,7 +54,7 @@ ############################################################################ # Search for an ensemble of machine learning algorithms -# ===================================================== +# ----------------------------------------------------- api.search( X_train=X_train, y_train=y_train, @@ -64,7 +67,7 @@ ############################################################################ # Print the final ensemble performance -# ==================================== +# ------------------------------------ y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) @@ -76,9 +79,13 @@ ############################################################################ +############################################################################ +# Cross validation Resampling Strategy +# ===================================== + ############################################################################ # Build and fit a classifier with Cross validation resampling strategy -# ==================================================================== +# -------------------------------------------------------------------- api = TabularClassificationTask( resampling_strategy=CrossValTypes.k_fold_cross_validation, resampling_strategy_args={'num_splits': 3} @@ -86,7 +93,8 @@ ############################################################################ # Search for an ensemble of machine learning algorithms -# ===================================================== +# ----------------------------------------------------------------------- + api.search( X_train=X_train, y_train=y_train, @@ -99,7 +107,7 @@ ############################################################################ # Print the final ensemble performance -# ==================================== +# ------------ y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) @@ -111,9 +119,13 @@ ############################################################################ +############################################################################ +# Stratified Resampling Strategy +# =============================== + ############################################################################ # Build and fit a classifier with Stratified resampling strategy -# ============================================================== +# -------------------------------------------------------------- api = TabularClassificationTask( # For demonstration purposes, we use # Stratified hold out validation. 
However, @@ -124,7 +136,7 @@ ############################################################################ # Search for an ensemble of machine learning algorithms -# ===================================================== +# ----------------------------------------------------- api.search( X_train=X_train, y_train=y_train, diff --git a/examples/40_advanced/example_run_with_portfolio.py b/examples/40_advanced/example_run_with_portfolio.py index 01d8bef15..fef230fc5 100644 --- a/examples/40_advanced/example_run_with_portfolio.py +++ b/examples/40_advanced/example_run_with_portfolio.py @@ -24,50 +24,48 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask -if __name__ == '__main__': +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, +) - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=42, - ) +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + seed=42, +) - ############################################################################ - # Build and fit a classifier - # ========================== - api = TabularClassificationTask( - seed=42, - ) +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=300, + func_eval_time_limit_secs=50, + # Setting this option to "greedy" + # will make smac run the configurations + # present in 'autoPyTorch/configs/greedy_portfolio.json' + portfolio_selection="greedy" +) - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=300, - func_eval_time_limit_secs=50, - # Setting this option to "greedy" - # will make smac run the configurations - # present in 'autoPyTorch/configs/greedy_portfolio.json' - portfolio_selection="greedy" - ) +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) - ############################################################################ - # Print the final ensemble performance - # ==================================== - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) - - # Print statistics from search - print(api.sprint_statistics()) +# Print statistics from search 
+print(api.sprint_statistics()) From ef8d21ae982883548a8e77a3690eefa810d3ae07 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Tue, 23 Nov 2021 11:14:57 +0100 Subject: [PATCH 4/7] Update README.md, remove cat requirements.txt --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 389ab7902..3fb449fb1 100755 --- a/README.md +++ b/README.md @@ -56,7 +56,6 @@ git submodule update --init --recursive conda create -n auto-pytorch python=3.8 conda activate auto-pytorch conda install swig -cat requirements.txt | xargs -n 1 -L 1 pip install python setup.py install ``` From 4dd22fdb67914917e2d505d3240b2a53f3cafb41 Mon Sep 17 00:00:00 2001 From: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> Date: Wed, 1 Dec 2021 17:50:40 +0100 Subject: [PATCH 5/7] [feat] Add an object that realizes the perf over time viz (#331) * [feat] Add an object that realizes the perf over time viz * [fix] Modify TODOs and add comments to avoid complications * [refactor] [feat] Format visualizer API and integrate this feature into BaseTask * [refactor] Separate a shared raise error process as a function * [refactor] Gather params in Dataclass to look smarter * [refactor] Merge extraction from history to the result manager Since this feature was added in a previous PR, we now rely on this feature to extract the history. To handle the order by the start time issue, I added the sort by endtime feature. * [feat] Merge the viz in the latest version * [fix] Fix nan --> worst val so that we can always handle by number * [fix] Fix mypy issues * [test] Add test for get_start_time * [test] Add test for order by end time * [test] Add tests for ensemble results * [test] Add tests for merging ensemble results and run history * [test] Add the tests in the case of ensemble_results is None * [fix] Alternate datetime to timestamp in tests to pass universally Since the mapping of timestamp to datetime variates on machine, the tests failed in the previous version. In this version, we changed the datetime in the tests to the fixed timestamp so that the tests will pass universally. * [fix] Fix status_msg --> status_type because it does not need to be str * [fix] Change the name for the homogeniety * [fix] Fix based on the file name change * [test] Add tests for set_plot_args * [test] Add tests for plot_perf_over_time in BaseTask * [refactor] Replace redundant lines by pytest parametrization * [test] Add tests for _get_perf_and_time * [fix] Remove viz attribute based on Ravin's comment * [fix] Fix doc-string based on Ravin's comments * [refactor] Hide color label settings extraction in dataclass Since this process makes the method in BaseTask redundant and this was pointed out by Ravin, I made this process a method of dataclass so that we can easily fetch this information. Note that since the color and label information always depend on the optimization results, we always need to pass metric results to ensure we only get related keys. * [test] Add tests for color label dicts extraction * [test] Add tests for checking if plt.show is called or not * [refactor] Address Ravin's comments and add TODO for the refactoring * [refactor] Change KeyError in EnsembleResults to empty Since it is not convenient to not be able to instantiate EnsembleResults in the case when we do not have any histories, I changed the functionality so that we can still instantiate even when the results are empty. 
In this case, we have empty arrays and it also matches the developers intuition. * [refactor] Prohibit external updates to make objects more robust * [fix] Remove a member variable _opt_scores since it is confusing Since opt_scores are taken from cost in run_history and metric_dict takes from additional_info, it was confusing for me where I should refer to what. By removing this, we can always refer to additional_info when fetching information and metrics are always available as a raw value. Although I changed a lot, the functionality did not change and it is easier to add any other functionalities now. * [example] Add an example how to plot performance over time * [fix] Fix unexpected train loss when using cross validation * [fix] Remove __main__ from example based on the Ravin's comment * [fix] Move results_xxx to utils from API * [enhance] Change example for the plot over time to save fig Since the plt.show() does not work on some environments, I changed the example so that everyone can run at least this example. --- autoPyTorch/api/base_task.py | 59 +- autoPyTorch/api/results_manager.py | 326 --------- autoPyTorch/evaluation/train_evaluator.py | 13 +- autoPyTorch/utils/results_manager.py | 686 ++++++++++++++++++ autoPyTorch/utils/results_visualizer.py | 310 ++++++++ .../40_advanced/example_plot_over_time.py | 82 +++ test/test_api/test_results_manager.py | 232 ------ .../runhistory.json} | 0 test/test_utils/test_results_manager.py | 484 ++++++++++++ test/test_utils/test_results_visualizer.py | 274 +++++++ 10 files changed, 1903 insertions(+), 563 deletions(-) delete mode 100644 autoPyTorch/api/results_manager.py create mode 100644 autoPyTorch/utils/results_manager.py create mode 100644 autoPyTorch/utils/results_visualizer.py create mode 100644 examples/40_advanced/example_plot_over_time.py delete mode 100644 test/test_api/test_results_manager.py rename test/{test_api/.tmp_api/runhistory_B.json => test_utils/runhistory.json} (100%) create mode 100644 test/test_utils/test_results_manager.py create mode 100644 test/test_utils/test_results_visualizer.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index a997c505b..edd505d86 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -21,6 +21,8 @@ import joblib +import matplotlib.pyplot as plt + import numpy as np import pandas as pd @@ -29,7 +31,7 @@ from smac.stats.stats import Stats from smac.tae import StatusType -from autoPyTorch.api.results_manager import ResultsManager, SearchResults +from autoPyTorch import metrics from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( REGRESSION_TASKS, @@ -58,6 +60,8 @@ ) from autoPyTorch.utils.parallel import preload_modules from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements +from autoPyTorch.utils.results_manager import MetricResults, ResultsManager, SearchResults +from autoPyTorch.utils.results_visualizer import ColorLabelSettings, PlotSettingParams, ResultsVisualizer from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch @@ -1479,3 +1483,56 @@ def sprint_statistics(self) -> str: scoring_functions=self._scoring_functions, metric=self._metric ) + + def plot_perf_over_time( + self, + metric_name: str, + ax: Optional[plt.Axes] = None, + plot_setting_params: PlotSettingParams = PlotSettingParams(), + color_label_settings: ColorLabelSettings = ColorLabelSettings(), + *args: Any, + **kwargs: Any 
+ ) -> None: + """ + Visualize the performance over time using matplotlib. + The plot related arguments are based on matplotlib. + Please refer to the matplotlib documentation for more details. + + Args: + metric_name (str): + The name of metric to visualize. + The names are available in + * autoPyTorch.metrics.CLASSIFICATION_METRICS + * autoPyTorch.metrics.REGRESSION_METRICS + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + color_label_settings (ColorLabelSettings): + The settings of a pair of color and label for each plot. + args, kwargs (Any): + Arguments for the ax.plot. + """ + + if not hasattr(metrics, metric_name): + raise ValueError( + f'metric_name must be in {list(metrics.CLASSIFICATION_METRICS.keys())} ' + f'or {list(metrics.REGRESSION_METRICS.keys())}, but got {metric_name}' + ) + if len(self.ensemble_performance_history) == 0: + raise RuntimeError('Visualization is available only after ensembles are evaluated.') + + results = MetricResults( + metric=getattr(metrics, metric_name), + run_history=self.run_history, + ensemble_performance_history=self.ensemble_performance_history + ) + + colors, labels = color_label_settings.extract_dicts(results) + + ResultsVisualizer().plot_perf_over_time( # type: ignore + results=results, plot_setting_params=plot_setting_params, + colors=colors, labels=labels, ax=ax, + *args, **kwargs + ) diff --git a/autoPyTorch/api/results_manager.py b/autoPyTorch/api/results_manager.py deleted file mode 100644 index e52d21613..000000000 --- a/autoPyTorch/api/results_manager.py +++ /dev/null @@ -1,326 +0,0 @@ -import io -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration - -import numpy as np - -import scipy - -from smac.runhistory.runhistory import RunHistory, RunValue -from smac.tae import StatusType -from smac.utils.io.traj_logging import TrajEntry - -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric - - -# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 -# is the new minimum required version! -STATUS2MSG = { - StatusType.SUCCESS: 'Success', - StatusType.DONOTADVANCE: 'Success (but did not advance to higher budget)', - StatusType.TIMEOUT: 'Timeout', - StatusType.CRASHED: 'Crash', - StatusType.ABORT: 'Abort', - StatusType.MEMOUT: 'Memory out' -} - - -def cost2metric(cost: float, metric: autoPyTorchMetric) -> float: - """ - Revert cost metric evaluated in SMAC to the original metric. - - The conversion is defined in: - autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * original_metric_value - ==> original_metric_value = metric._sign * (metric._optimum - cost) - """ - return metric._sign * (metric._optimum - cost) - - -def _extract_metrics_info( - run_value: RunValue, - scoring_functions: List[autoPyTorchMetric] -) -> Dict[str, float]: - """ - Extract the metric information given a run_value - and a list of metrics of interest. - - Args: - run_value (RunValue): - The information for each config evaluation. - scoring_functions (List[autoPyTorchMetric]): - The list of metrics to retrieve the info. - """ - - if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE): - # Additional info for metrics is not available in this case. 
- return {metric.name: np.nan for metric in scoring_functions} - - cost_info = run_value.additional_info['opt_loss'] - avail_metrics = cost_info.keys() - - return { - metric.name: cost2metric(cost=cost_info[metric.name], metric=metric) - if metric.name in avail_metrics else np.nan - for metric in scoring_functions - } - - -class SearchResults: - def __init__( - self, - metric: autoPyTorchMetric, - scoring_functions: List[autoPyTorchMetric], - run_history: RunHistory - ): - self.metric_dict: Dict[str, List[float]] = { - metric.name: [] - for metric in scoring_functions - } - self._opt_scores: List[float] = [] - self._fit_times: List[float] = [] - self.configs: List[Configuration] = [] - self.status_types: List[str] = [] - self.budgets: List[float] = [] - self.config_ids: List[int] = [] - self.is_traditionals: List[bool] = [] - self.additional_infos: List[Optional[Dict[str, Any]]] = [] - self.rank_test_scores: np.ndarray = np.array([]) - self._scoring_functions = scoring_functions - self._metric = metric - - self._extract_results_from_run_history(run_history) - - @property - def opt_scores(self) -> np.ndarray: - return np.asarray(self._opt_scores) - - @property - def fit_times(self) -> np.ndarray: - return np.asarray(self._fit_times) - - def update( - self, - config: Configuration, - status: str, - budget: float, - fit_time: float, - config_id: int, - is_traditional: bool, - additional_info: Dict[str, Any], - score: float, - metric_info: Dict[str, float] - ) -> None: - - self.status_types.append(status) - self.configs.append(config) - self.budgets.append(budget) - self.config_ids.append(config_id) - self.is_traditionals.append(is_traditional) - self.additional_infos.append(additional_info) - self._fit_times.append(fit_time) - self._opt_scores.append(score) - - for metric_name, val in metric_info.items(): - self.metric_dict[metric_name].append(val) - - def clear(self) -> None: - self._opt_scores = [] - self._fit_times = [] - self.configs = [] - self.status_types = [] - self.budgets = [] - self.config_ids = [] - self.additional_infos = [] - self.is_traditionals = [] - self.rank_test_scores = np.array([]) - - def _extract_results_from_run_history(self, run_history: RunHistory) -> None: - """ - Extract the information to match this class format. - - Args: - run_history (RunHistory): - The history of config evals from SMAC. 
- """ - - self.clear() # Delete cache before the extraction - - for run_key, run_value in run_history.data.items(): - config_id = run_key.config_id - config = run_history.ids_config[config_id] - - status_msg = STATUS2MSG.get(run_value.status, None) - if run_value.status in (StatusType.STOP, StatusType.RUNNING): - continue - elif status_msg is None: - raise ValueError(f'Unexpected run status: {run_value.status}') - - is_traditional = False # If run is not successful, unsure ==> not True ==> False - if run_value.additional_info is not None: - is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' - - self.update( - status=status_msg, - config=config, - budget=run_key.budget, - fit_time=run_value.time, - score=cost2metric(cost=run_value.cost, metric=self._metric), - metric_info=_extract_metrics_info(run_value=run_value, scoring_functions=self._scoring_functions), - is_traditional=is_traditional, - additional_info=run_value.additional_info, - config_id=config_id - ) - - self.rank_test_scores = scipy.stats.rankdata( - -1 * self._metric._sign * self.opt_scores, # rank order - method='min' - ) - - -class ResultsManager: - def __init__(self, *args: Any, **kwargs: Any): - """ - Attributes: - run_history (RunHistory): - A `SMAC Runshistory `_ - object that holds information about the runs of the target algorithm made during search - ensemble_performance_history (List[Dict[str, Any]]): - The list of ensemble performance in the optimization. - The list includes the `timestamp`, `result on train set`, and `result on test set` - trajectory (List[TrajEntry]): - A list of all incumbent configurations during search - """ - self.run_history: RunHistory = RunHistory() - self.ensemble_performance_history: List[Dict[str, Any]] = [] - self.trajectory: List[TrajEntry] = [] - - def _check_run_history(self) -> None: - if self.run_history is None: - raise RuntimeError("No Run History found, search has not been called.") - - if self.run_history.empty(): - raise RuntimeError("Run History is empty. Something went wrong, " - "SMAC was not able to fit any model?") - - def get_incumbent_results( - self, - metric: autoPyTorchMetric, - include_traditional: bool = False - ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: - """ - Get Incumbent config and the corresponding results - - Args: - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - include_traditional (bool): - Whether to include results from tradtional pipelines - - Returns: - Configuration (CS.ConfigurationSpace): - The incumbent configuration - Dict[str, Union[int, str, float]]: - Additional information about the run of the incumbent configuration. 
- """ - self._check_run_history() - - results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history) - - if not include_traditional: - non_traditional = ~np.array(results.is_traditionals) - scores = results.opt_scores[non_traditional] - indices = np.arange(len(results.configs))[non_traditional] - else: - scores = results.opt_scores - indices = np.arange(len(results.configs)) - - incumbent_idx = indices[np.nanargmax(metric._sign * scores)] - incumbent_config = results.configs[incumbent_idx] - incumbent_results = results.additional_infos[incumbent_idx] - - assert incumbent_results is not None # mypy check - return incumbent_config, incumbent_results - - def get_search_results( - self, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> SearchResults: - """ - This attribute is populated with data from `self.run_history` - and contains information about the configurations, and their - corresponding metric results, status of run, parameters and - the budget - - Args: - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - - Returns: - SearchResults: - An instance that contains the results from search - """ - self._check_run_history() - return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history) - - def sprint_statistics( - self, - dataset_name: str, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> str: - """ - Prints statistics about the SMAC search. - - These statistics include: - - 1. Optimisation Metric - 2. Best Optimisation score achieved by individual pipelines - 3. Total number of target algorithm runs - 4. Total number of successful target algorithm runs - 5. Total number of crashed target algorithm runs - 6. Total number of target algorithm runs that exceeded the time limit - 7. Total number of successful target algorithm runs that exceeded the memory limit - - Args: - dataset_name (str): - The dataset name that was used in the run. - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. 
- - Returns: - (str): - Formatted string with statistics - """ - search_results = self.get_search_results(scoring_functions, metric) - success_msgs = (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]) - sio = io.StringIO() - sio.write("autoPyTorch results:\n") - sio.write(f"\tDataset name: {dataset_name}\n") - sio.write(f"\tOptimisation Metric: {metric}\n") - - num_runs = len(search_results.status_types) - num_success = sum([s in success_msgs for s in search_results.status_types]) - num_crash = sum([s == STATUS2MSG[StatusType.CRASHED] for s in search_results.status_types]) - num_timeout = sum([s == STATUS2MSG[StatusType.TIMEOUT] for s in search_results.status_types]) - num_memout = sum([s == STATUS2MSG[StatusType.MEMOUT] for s in search_results.status_types]) - - if num_success > 0: - best_score = metric._sign * np.nanmax(metric._sign * search_results.opt_scores) - sio.write(f"\tBest validation score: {best_score}\n") - - sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") - sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") - sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") - sio.write(f"\tNumber of target algorithms that exceeded the time " - f"limit: {num_timeout}\n") - sio.write(f"\tNumber of target algorithms that exceeded the memory " - f"limit: {num_memout}\n") - - return sio.getvalue() diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 010948b55..37926a8c0 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -254,10 +254,15 @@ def fit_predict_and_loss(self) -> None: # train_losses is a list of dicts. It is # computed using the target metric (self.metric). - train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_folds)], - weights=train_fold_weights, - ) + train_loss = {} + for metric in train_losses[0].keys(): + train_loss[metric] = np.average( + [ + train_losses[i][metric] + for i in range(self.num_folds) + ], + weights=train_fold_weights + ) opt_loss = {} # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None)) diff --git a/autoPyTorch/utils/results_manager.py b/autoPyTorch/utils/results_manager.py new file mode 100644 index 000000000..c1860b0f6 --- /dev/null +++ b/autoPyTorch/utils/results_manager.py @@ -0,0 +1,686 @@ +import io +from datetime import datetime +from typing import Any, Dict, List, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +import scipy + +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue +from smac.tae import StatusType +from smac.utils.io.traj_logging import TrajEntry + +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric + + +# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 +# is the new minimum required version! +STATUS_TYPES = [ + StatusType.SUCCESS, + # Success (but did not advance to higher budget such as cutoff by hyperband) + StatusType.DONOTADVANCE, + StatusType.TIMEOUT, + StatusType.CRASHED, + StatusType.ABORT, + StatusType.MEMOUT +] + + +def cost2metric(cost: float, metric: autoPyTorchMetric) -> float: + """ + Revert cost metric evaluated in SMAC to the original metric. 
+ + The conversion is defined in: + autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss + cost = metric._optimum - metric._sign * original_metric_value + ==> original_metric_value = metric._sign * (metric._optimum - cost) + """ + return metric._sign * (metric._optimum - cost) + + +def get_start_time(run_history: RunHistory) -> float: + """ + Get start time of optimization. + + Args: + run_history (RunHistory): + The history of config evals from SMAC. + + Returns: + starttime (float): + The start time of the first training. + """ + + start_times = [] + for run_value in run_history.data.values(): + if run_value.status in (StatusType.STOP, StatusType.RUNNING): + continue + elif run_value.status not in STATUS_TYPES: + raise ValueError(f'Unexpected run status: {run_value.status}') + + start_times.append(run_value.starttime) + + return float(np.min(start_times)) # mypy redefinition + + +def _extract_metrics_info( + run_value: RunValue, + scoring_functions: List[autoPyTorchMetric], + inference_name: str +) -> Dict[str, float]: + """ + Extract the metric information given a run_value + and a list of metrics of interest. + + Args: + run_value (RunValue): + The information for each config evaluation. + scoring_functions (List[autoPyTorchMetric]): + The list of metrics to retrieve the info. + inference_name (str): + The name of the inference. Either `train`, `opt` or `test`. + + Returns: + metric_info (Dict[str, float]): + The metric values of interest. + Since the metrics in additional_info are `cost`, + we transform them into the original form. + """ + + if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE): + # Additional info for metrics is not available in this case. + return {metric.name: metric._worst_possible_result for metric in scoring_functions} + + inference_choices = ['train', 'opt', 'test'] + if inference_name not in inference_choices: + raise ValueError(f'inference_name must be in {inference_choices}, but got {inference_choices}') + + cost_info = run_value.additional_info[f'{inference_name}_loss'] + avail_metrics = cost_info.keys() + + return { + metric.name: cost2metric(cost=cost_info[metric.name], metric=metric) + if metric.name in avail_metrics else metric._worst_possible_result + for metric in scoring_functions + } + + +class EnsembleResults: + def __init__( + self, + metric: autoPyTorchMetric, + ensemble_performance_history: List[Dict[str, Any]], + order_by_endtime: bool = False + ): + """ + The wrapper class for ensemble_performance_history. + This class extracts the information from ensemble_performance_history + and allows other class to easily handle the history. + + Attributes: + train_scores (List[float]): + The ensemble scores on the training dataset. + test_scores (List[float]): + The ensemble scores on the test dataset. + end_times (List[float]): + The end time of the end of each ensemble evaluation. + Each element is a float timestamp. + empty (bool): + Whether the ensemble history about `self.metric` is empty or not. + metric (autoPyTorchMetric): + The information about the metric to contain. + In the case when such a metric does not exist in the record, + This class raises KeyError. + """ + self._test_scores: List[float] = [] + self._train_scores: List[float] = [] + self._end_times: List[float] = [] + self._metric = metric + self._empty = True # Initial state is empty. 
+ self._instantiated = False + + self._extract_results_from_ensemble_performance_history(ensemble_performance_history) + if order_by_endtime: + self._sort_by_endtime() + + self._instantiated = True + + @property + def train_scores(self) -> np.ndarray: + return np.asarray(self._train_scores) + + @property + def test_scores(self) -> np.ndarray: + return np.asarray(self._test_scores) + + @property + def end_times(self) -> np.ndarray: + return np.asarray(self._end_times) + + @property + def metric_name(self) -> str: + return self._metric.name + + def empty(self) -> bool: + """ This is not property to follow coding conventions. """ + return self._empty + + def _update(self, data: Dict[str, Any]) -> None: + if self._instantiated: + raise RuntimeError( + 'EnsembleResults should not be overwritten once instantiated. ' + 'Instantiate new object rather than using update.' + ) + + self._train_scores.append(data[f'train_{self.metric_name}']) + self._test_scores.append(data[f'test_{self.metric_name}']) + self._end_times.append(datetime.timestamp(data['Timestamp'])) + + def _sort_by_endtime(self) -> None: + """ + Since the default order is by start time + and parallel computation might change the order of ending, + this method provides the feature to sort by end time. + Note that this method is destructive. + """ + if self._instantiated: + raise RuntimeError( + 'EnsembleResults should not be overwritten once instantiated. ' + 'Instantiate new object with order_by_endtime=True.' + ) + + order = np.argsort(self._end_times) + + self._train_scores = self.train_scores[order].tolist() + self._test_scores = self.test_scores[order].tolist() + self._end_times = self.end_times[order].tolist() + + def _extract_results_from_ensemble_performance_history( + self, + ensemble_performance_history: List[Dict[str, Any]] + ) -> None: + """ + Extract information to from `ensemble_performance_history` + to match the format of this class format. + + Args: + ensemble_performance_history (List[Dict[str, Any]]): + The history of the ensemble performance from EnsembleBuilder. + Its key must be either `train_xxx`, `test_xxx` or `Timestamp`. + """ + + if ( + len(ensemble_performance_history) == 0 + or f'train_{self.metric_name}' not in ensemble_performance_history[0].keys() + ): + self._empty = True + return + + self._empty = False # We can extract ==> not empty + for data in ensemble_performance_history: + self._update(data) + + +class SearchResults: + def __init__( + self, + metric: autoPyTorchMetric, + scoring_functions: List[autoPyTorchMetric], + run_history: RunHistory, + order_by_endtime: bool = False + ): + """ + The wrapper class for run_history. + This class extracts the information from run_history + and allows other class to easily handle the history. + Note that the data is sorted by starttime by default and + metric_dict has the original form of metric value, i.e. not necessarily cost. + + Attributes: + train_metric_dict (Dict[str, List[float]]): + The extracted train metric information at each evaluation. + Each list keeps the metric information specified by scoring_functions and metric. + opt_metric_dict (Dict[str, List[float]]): + The extracted opt metric information at each evaluation. + Each list keeps the metric information specified by scoring_functions and metric. + test_metric_dict (Dict[str, List[float]]): + The extracted test metric information at each evaluation. + Each list keeps the metric information specified by scoring_functions and metric. 
+ fit_times (List[float]): + The time needed to fit each model. + end_times (List[float]): + The end time of the end of each evaluation. + Each element is a float timestamp. + configs (List[Configuration]): + The configurations at each evaluation. + status_types (List[StatusType]): + The list of status types of each evaluation (e.g. success, crush). + budgets (List[float]): + The budgets used for each evaluation. + Here, budget refers to the definition in Hyperband or Successive halving. + config_ids (List[int]): + The ID of each configuration. Since we use cutoff such as in Hyperband, + we need to store it to know whether each configuration is a suvivor. + is_traditionals (List[bool]): + Whether each configuration is from traditional machine learning methods. + additional_infos (List[Dict[str, float]]): + It usually serves as the source of each metric at each evaluation. + In other words, train or test performance is extracted from this info. + rank_opt_scores (np.ndarray): + The rank of each evaluation among all the evaluations. + metric (autoPyTorchMetric): + The metric of the main interest. + scoring_functions (List[autoPyTorchMetric]): + The list of metrics to contain in the additional_infos. + """ + if metric not in scoring_functions: + scoring_functions.append(metric) + + self.train_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + self.opt_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + self.test_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + + self._fit_times: List[float] = [] + self._end_times: List[float] = [] + self.configs: List[Configuration] = [] + self.status_types: List[StatusType] = [] + self.budgets: List[float] = [] + self.config_ids: List[int] = [] + self.is_traditionals: List[bool] = [] + self.additional_infos: List[Dict[str, float]] = [] + self.rank_opt_scores: np.ndarray = np.array([]) + self._scoring_functions = scoring_functions + self._metric = metric + self._instantiated = False + + self._extract_results_from_run_history(run_history) + if order_by_endtime: + self._sort_by_endtime() + + self._instantiated = True + + @property + def train_scores(self) -> np.ndarray: + """ training metric values at each evaluation """ + return np.asarray(self.train_metric_dict[self.metric_name]) + + @property + def opt_scores(self) -> np.ndarray: + """ validation metric values at each evaluation """ + return np.asarray(self.opt_metric_dict[self.metric_name]) + + @property + def test_scores(self) -> np.ndarray: + """ test metric values at each evaluation """ + return np.asarray(self.test_metric_dict[self.metric_name]) + + @property + def fit_times(self) -> np.ndarray: + return np.asarray(self._fit_times) + + @property + def end_times(self) -> np.ndarray: + return np.asarray(self._end_times) + + @property + def metric_name(self) -> str: + return self._metric.name + + def _update( + self, + config: Configuration, + run_key: RunKey, + run_value: RunValue + ) -> None: + + if self._instantiated: + raise RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object rather than using update.' 
+ ) + elif run_value.status in (StatusType.STOP, StatusType.RUNNING): + return + elif run_value.status not in STATUS_TYPES: + raise ValueError(f'Unexpected run status: {run_value.status}') + + is_traditional = False # If run is not successful, unsure ==> not True ==> False + if run_value.additional_info is not None: + is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' + + self.status_types.append(run_value.status) + self.configs.append(config) + self.budgets.append(run_key.budget) + self.config_ids.append(run_key.config_id) + self.is_traditionals.append(is_traditional) + self.additional_infos.append(run_value.additional_info) + self._fit_times.append(run_value.time) + self._end_times.append(run_value.endtime) + + for inference_name in ['train', 'opt', 'test']: + metric_info = _extract_metrics_info( + run_value=run_value, + scoring_functions=self._scoring_functions, + inference_name=inference_name + ) + for metric_name, val in metric_info.items(): + getattr(self, f'{inference_name}_metric_dict')[metric_name].append(val) + + def _sort_by_endtime(self) -> None: + """ + Since the default order is by start time + and parallel computation might change the order of ending, + this method provides the feature to sort by end time. + Note that this method is destructive. + """ + if self._instantiated: + raise RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object with order_by_endtime=True.' + ) + + order = np.argsort(self._end_times) + + self.train_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.train_metric_dict.items()} + self.opt_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.opt_metric_dict.items()} + self.test_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.test_metric_dict.items()} + + self._fit_times = [self._fit_times[idx] for idx in order] + self._end_times = [self._end_times[idx] for idx in order] + self.status_types = [self.status_types[idx] for idx in order] + self.budgets = [self.budgets[idx] for idx in order] + self.config_ids = [self.config_ids[idx] for idx in order] + self.is_traditionals = [self.is_traditionals[idx] for idx in order] + self.additional_infos = [self.additional_infos[idx] for idx in order] + + # Don't use numpy slicing to avoid version dependency (cast config to object might cause issues) + self.configs = [self.configs[idx] for idx in order] + + # Only rank_opt_scores is np.ndarray + self.rank_opt_scores = self.rank_opt_scores[order] + + def _extract_results_from_run_history(self, run_history: RunHistory) -> None: + """ + Extract the information to match this class format. + + Args: + run_history (RunHistory): + The history of config evals from SMAC. + """ + + for run_key, run_value in run_history.data.items(): + config = run_history.ids_config[run_key.config_id] + self._update(config=config, run_key=run_key, run_value=run_value) + + self.rank_opt_scores = scipy.stats.rankdata( + -1 * self._metric._sign * self.opt_scores, # rank order + method='min' + ) + + +class MetricResults: + def __init__( + self, + metric: autoPyTorchMetric, + run_history: RunHistory, + ensemble_performance_history: List[Dict[str, Any]] + ): + """ + The wrapper class for ensemble_performance_history. + This class extracts the information from ensemble_performance_history + and allows other class to easily handle the history. + Note that all the data is sorted by endtime! 
+ + Attributes: + start_time (float): + The timestamp at the very beginning of the optimization. + cum_times (np.ndarray): + The runtime needed to reach the end of each evaluation. + The time unit is second. + metric (autoPyTorchMetric): + The information about the metric to contain. + search_results (SearchResults): + The instance to fetch the metric values of `self.metric` + from run_history. + ensemble_results (EnsembleResults): + The instance to fetch the metric values of `self.metric` + from ensemble_performance_history. + If there is no information available, self.empty() returns True. + data (Dict[str, np.ndarray]): + Keys are `{single, ensemble}::{train, opt, test}::{metric.name}`. + Each array contains the evaluated values for the corresponding category. + """ + self.start_time = get_start_time(run_history) + self.metric = metric + self.search_results = SearchResults( + metric=metric, + run_history=run_history, + scoring_functions=[], + order_by_endtime=True + ) + self.ensemble_results = EnsembleResults( + metric=metric, + ensemble_performance_history=ensemble_performance_history, + order_by_endtime=True + ) + + if ( + not self.ensemble_results.empty() + and self.search_results.end_times[-1] < self.ensemble_results.end_times[-1] + ): + # Augment runtime table with the final available end time + self.cum_times = np.hstack( + [self.search_results.end_times - self.start_time, + [self.ensemble_results.end_times[-1] - self.start_time]] + ) + else: + self.cum_times = self.search_results.end_times - self.start_time + + self.data: Dict[str, np.ndarray] = {} + self._extract_results() + + def _extract_results(self) -> None: + """ Extract metric values of `self.metric` and store them in `self.data`. """ + metric_name = self.metric.name + for inference_name in ['train', 'test', 'opt']: + # TODO: Extract information from self.search_results + data = getattr(self.search_results, f'{inference_name}_metric_dict')[metric_name] + self.data[f'single::{inference_name}::{metric_name}'] = np.array(data) + + if self.ensemble_results.empty() or inference_name == 'opt': + continue + + data = getattr(self.ensemble_results, f'{inference_name}_scores') + self.data[f'ensemble::{inference_name}::{metric_name}'] = np.array(data) + + def get_ensemble_merged_data(self) -> Dict[str, np.ndarray]: + """ + Merge the ensemble performance data to the closest time step + available in the run_history. + One performance metric will be allocated to one time step. + Other time steps will be filled by the worst possible value. 
+ + Returns: + data (Dict[str, np.ndarray]): + Merged data as mentioned above + """ + + data = {k: v.copy() for k, v in self.data.items()} # deep copy + + if self.ensemble_results.empty(): # no ensemble data available + return data + + train_scores, test_scores = self.ensemble_results.train_scores, self.ensemble_results.test_scores + end_times = self.ensemble_results.end_times + cur, timestep_size, sign = 0, self.cum_times.size, self.metric._sign + key_train, key_test = f'ensemble::train::{self.metric.name}', f'ensemble::test::{self.metric.name}' + + train_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result) + test_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result) + + for timestamp, train_score, test_score in zip(end_times, train_scores, test_scores): + avail_time = timestamp - self.start_time + while cur < timestep_size and self.cum_times[cur] < avail_time: + # Guarantee that cum_times[cur] >= avail_time + cur += 1 + + # results[cur] is the closest available checkpoint after or at the avail_time + # ==> Assign this data to that checkpoint + time_index = min(cur, timestep_size - 1) + # If there already exists a previous allocated value, update by a better value + train_perfs[time_index] = sign * max(sign * train_perfs[time_index], sign * train_score) + test_perfs[time_index] = sign * max(sign * test_perfs[time_index], sign * test_score) + + data.update({key_train: train_perfs, key_test: test_perfs}) + return data + + +class ResultsManager: + def __init__(self, *args: Any, **kwargs: Any): + """ + This module is used to gather result information for BaseTask. + In other words, this module is supposed to be wrapped by BaseTask. + + Attributes: + run_history (RunHistory): + A `SMAC Runshistory `_ + object that holds information about the runs of the target algorithm made during search + ensemble_performance_history (List[Dict[str, Any]]): + The history of the ensemble performance from EnsembleBuilder. + Its keys are `train_xxx`, `test_xxx` or `Timestamp`. + trajectory (List[TrajEntry]): + A list of all incumbent configurations during search + """ + self.run_history: RunHistory = RunHistory() + self.ensemble_performance_history: List[Dict[str, Any]] = [] + self.trajectory: List[TrajEntry] = [] + + def _check_run_history(self) -> None: + if self.run_history is None: + raise RuntimeError("No Run History found, search has not been called.") + + if self.run_history.empty(): + raise RuntimeError("Run History is empty. Something went wrong, " + "SMAC was not able to fit any model?") + + def get_incumbent_results( + self, + metric: autoPyTorchMetric, + include_traditional: bool = False + ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: + """ + Get Incumbent config and the corresponding results + + Args: + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. + include_traditional (bool): + Whether to include results from tradtional pipelines + + Returns: + Configuration (CS.ConfigurationSpace): + The incumbent configuration + Dict[str, Union[int, str, float]]: + Additional information about the run of the incumbent configuration. 
+ """ + self._check_run_history() + + results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history) + + if not include_traditional: + non_traditional = ~np.array(results.is_traditionals) + scores = results.opt_scores[non_traditional] + indices = np.arange(len(results.configs))[non_traditional] + else: + scores = results.opt_scores + indices = np.arange(len(results.configs)) + + incumbent_idx = indices[np.argmax(metric._sign * scores)] + incumbent_config = results.configs[incumbent_idx] + incumbent_results = results.additional_infos[incumbent_idx] + + assert incumbent_results is not None # mypy check + return incumbent_config, incumbent_results + + def get_search_results( + self, + scoring_functions: List[autoPyTorchMetric], + metric: autoPyTorchMetric + ) -> SearchResults: + """ + This attribute is populated with data from `self.run_history` + and contains information about the configurations, and their + corresponding metric results, status of run, parameters and + the budget + + Args: + scoring_functions (List[autoPyTorchMetric]): + Metrics to show in the results. + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. + + Returns: + SearchResults: + An instance that contains the results from search + """ + self._check_run_history() + return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history) + + def sprint_statistics( + self, + dataset_name: str, + scoring_functions: List[autoPyTorchMetric], + metric: autoPyTorchMetric + ) -> str: + """ + Prints statistics about the SMAC search. + + These statistics include: + + 1. Optimisation Metric + 2. Best Optimisation score achieved by individual pipelines + 3. Total number of target algorithm runs + 4. Total number of successful target algorithm runs + 5. Total number of crashed target algorithm runs + 6. Total number of target algorithm runs that exceeded the time limit + 7. Total number of successful target algorithm runs that exceeded the memory limit + + Args: + dataset_name (str): + The dataset name that was used in the run. + scoring_functions (List[autoPyTorchMetric]): + Metrics to show in the results. + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. 
+ + Returns: + (str): + Formatted string with statistics + """ + search_results = self.get_search_results(scoring_functions, metric) + success_status = (StatusType.SUCCESS, StatusType.DONOTADVANCE) + sio = io.StringIO() + sio.write("autoPyTorch results:\n") + sio.write(f"\tDataset name: {dataset_name}\n") + sio.write(f"\tOptimisation Metric: {metric}\n") + + num_runs = len(search_results.status_types) + num_success = sum([s in success_status for s in search_results.status_types]) + num_crash = sum([s == StatusType.CRASHED for s in search_results.status_types]) + num_timeout = sum([s == StatusType.TIMEOUT for s in search_results.status_types]) + num_memout = sum([s == StatusType.MEMOUT for s in search_results.status_types]) + + if num_success > 0: + best_score = metric._sign * np.max(metric._sign * search_results.opt_scores) + sio.write(f"\tBest validation score: {best_score}\n") + + sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") + sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") + sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") + sio.write(f"\tNumber of target algorithms that exceeded the time " + f"limit: {num_timeout}\n") + sio.write(f"\tNumber of target algorithms that exceeded the memory " + f"limit: {num_memout}\n") + + return sio.getvalue() diff --git a/autoPyTorch/utils/results_visualizer.py b/autoPyTorch/utils/results_visualizer.py new file mode 100644 index 000000000..64c87ba94 --- /dev/null +++ b/autoPyTorch/utils/results_visualizer.py @@ -0,0 +1,310 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, Optional, Tuple + +import matplotlib.pyplot as plt + +import numpy as np + +from autoPyTorch.utils.results_manager import MetricResults + + +plt.rcParams["font.family"] = "Times New Roman" +plt.rcParams["font.size"] = 18 + + +@dataclass(frozen=True) +class ColorLabelSettings: + """ + The settings for each plot. + If None is provided, those plots are omitted. + + Attributes: + single_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single train result. + single_opt (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single result used in optimization. + single_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single test result. + ensemble_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble train result. + ensemble_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble test result. + """ + single_train: Optional[Tuple[Optional[str], Optional[str]]] = ('red', None) + single_opt: Optional[Tuple[Optional[str], Optional[str]]] = ('blue', None) + single_test: Optional[Tuple[Optional[str], Optional[str]]] = ('green', None) + ensemble_train: Optional[Tuple[Optional[str], Optional[str]]] = ('brown', None) + ensemble_test: Optional[Tuple[Optional[str], Optional[str]]] = ('purple', None) + + def extract_dicts( + self, + results: MetricResults + ) -> Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]: + """ + Args: + results (MetricResults): + The results of the optimization in the base task API. + It determines what keys to include. + + Returns: + colors, labels (Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]): + The dicts for colors and labels. 
+ The keys are determined by results and each label and color + are determined by each instantiation. + Note that the keys include the metric name. + """ + + colors, labels = {}, {} + + for key, color_label in vars(self).items(): + if color_label is None: + continue + + prefix = '::'.join(key.split('_')) + try: + new_key = [key for key in results.data.keys() if key.startswith(prefix)][0] + colors[new_key], labels[new_key] = color_label + except IndexError: # ensemble does not always have results + pass + + return colors, labels + + +@dataclass(frozen=True) +class PlotSettingParams: + """ + Parameters for the plot environment. + + Attributes: + n_points (int): + The number of points to plot. + xlabel (Optional[str]): + The label in the x axis. + ylabel (Optional[str]): + The label in the y axis. + xscale (str): + The scale of x axis. + yscale (str): + The scale of y axis. + title (Optional[str]): + The title of the subfigure. + xlim (Tuple[float, float]): + The range of x axis. + ylim (Tuple[float, float]): + The range of y axis. + legend (bool): + Whether to have legend in the figure. + legend_loc (str): + The location of the legend. + show (bool): + Whether to show the plot. + args, kwargs (Any): + Arguments for the ax.plot. + """ + n_points: int = 20 + xscale: str = 'linear' + yscale: str = 'linear' + xlabel: Optional[str] = None + ylabel: Optional[str] = None + title: Optional[str] = None + xlim: Optional[Tuple[float, float]] = None + ylim: Optional[Tuple[float, float]] = None + legend: bool = True + legend_loc: str = 'best' + show: bool = False + figsize: Optional[Tuple[int, int]] = None + + +class ScaleChoices(Enum): + linear = 'linear' + log = 'log' + + +def _get_perf_and_time( + cum_results: np.ndarray, + cum_times: np.ndarray, + plot_setting_params: PlotSettingParams, + worst_val: float +) -> Tuple[np.ndarray, np.ndarray]: + """ + Get the performance and time step to plot. + + Args: + cum_results (np.ndarray): + The cumulated performance per evaluation. + cum_times (np.ndarray): + The cumulated runtime at the end of each evaluation. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + worst_val (float): + The worst possible value given a metric. + + Returns: + check_points (np.ndarray): + The time in second where the plot will happen. + perf_by_time_step (np.ndarray): + The best performance at the corresponding time in second + where the plot will happen. 
+ """ + + scale_choices = [s.name for s in ScaleChoices] + if plot_setting_params.xscale not in scale_choices or plot_setting_params.yscale not in scale_choices: + raise ValueError(f'xscale and yscale must be in {scale_choices}, ' + f'but got xscale={plot_setting_params.xscale}, yscale={plot_setting_params.yscale}') + + n_evals, runtime_lb, runtime_ub = cum_results.size, cum_times[0], cum_times[-1] + + if plot_setting_params.xscale == 'log': + # Take the even time interval in the log scale and revert + check_points = np.exp(np.linspace(np.log(runtime_lb), np.log(runtime_ub), plot_setting_params.n_points)) + else: + check_points = np.linspace(runtime_lb, runtime_ub, plot_setting_params.n_points) + + check_points += 1e-8 # Prevent float error + + # The worst possible value is always at the head + perf_by_time_step = np.full_like(check_points, worst_val) + cur = 0 + + for i, check_point in enumerate(check_points): + while cur < n_evals and cum_times[cur] <= check_point: + # Guarantee that cum_times[cur] > check_point + # ==> cum_times[cur - 1] <= check_point + cur += 1 + if cur: # filter cur - 1 == -1 + # results[cur - 1] was obtained before or at the checkpoint + # ==> The best performance up to this checkpoint + perf_by_time_step[i] = cum_results[cur - 1] + + if plot_setting_params.yscale == 'log' and np.any(perf_by_time_step < 0): + raise ValueError('log scale is not available when performance metric can be negative.') + + return check_points, perf_by_time_step + + +class ResultsVisualizer: + @staticmethod + def _set_plot_args( + ax: plt.Axes, + plot_setting_params: PlotSettingParams + ) -> None: + if plot_setting_params.xlim is not None: + ax.set_xlim(*plot_setting_params.xlim) + if plot_setting_params.ylim is not None: + ax.set_ylim(*plot_setting_params.ylim) + + if plot_setting_params.xlabel is not None: + ax.set_xlabel(plot_setting_params.xlabel) + if plot_setting_params.ylabel is not None: + ax.set_ylabel(plot_setting_params.ylabel) + + ax.set_xscale(plot_setting_params.xscale) + ax.set_yscale(plot_setting_params.yscale) + if plot_setting_params.xscale == 'log' or plot_setting_params.yscale == 'log': + ax.grid(True, which='minor', color='gray', linestyle=':') + + ax.grid(True, which='major', color='black') + + if plot_setting_params.legend: + ax.legend(loc=plot_setting_params.legend_loc) + + if plot_setting_params.title is not None: + ax.set_title(plot_setting_params.title) + if plot_setting_params.show: + plt.show() + + @staticmethod + def _plot_individual_perf_over_time( + ax: plt.Axes, + cum_times: np.ndarray, + cum_results: np.ndarray, + worst_val: float, + plot_setting_params: PlotSettingParams, + label: Optional[str] = None, + color: Optional[str] = None, + *args: Any, + **kwargs: Any + ) -> None: + """ + Plot the incumbent performance of the AutoPytorch over time. + This method is created to make plot_perf_over_time more readable + and it is not supposed to be used only in this class, but not from outside. + + Args: + ax (plt.Axes): + axis to plot (subplots of matplotlib). + cum_times (np.ndarray): + The cumulated time until each end of config evaluation. + results (np.ndarray): + The cumulated performance per evaluation. + worst_val (float): + The worst possible value given a metric. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + label (Optional[str]): + The name of the plot. + color (Optional[str]): + Color of the plot. + args, kwargs (Any): + Arguments for the ax.plot. 
+ """ + check_points, perf_by_time_step = _get_perf_and_time( + cum_results=cum_results, + cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=worst_val + ) + + ax.plot(check_points, perf_by_time_step, color=color, label=label, *args, **kwargs) + + def plot_perf_over_time( + self, + results: MetricResults, + plot_setting_params: PlotSettingParams, + colors: Dict[str, Optional[str]], + labels: Dict[str, Optional[str]], + ax: Optional[plt.Axes] = None, + *args: Any, + **kwargs: Any + ) -> None: + """ + Plot the incumbent performance of the AutoPytorch over time. + + Args: + results (MetricResults): + The module that handles results from various sources. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + labels (Dict[str, Optional[str]]): + The name of the plot. + colors (Dict[str, Optional[str]]): + Color of the plot. + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + args, kwargs (Any): + Arguments for the ax.plot. + """ + if ax is None: + _, ax = plt.subplots(nrows=1, ncols=1) + + data = results.get_ensemble_merged_data() + cum_times = results.cum_times + minimize = (results.metric._sign == -1) + + for key in data.keys(): + _label, _color, _perfs = labels[key], colors[key], data[key] + # Take the best results over time + _cum_perfs = np.minimum.accumulate(_perfs) if minimize else np.maximum.accumulate(_perfs) + + self._plot_individual_perf_over_time( # type: ignore + ax=ax, cum_results=_cum_perfs, cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=results.metric._worst_possible_result, + label=_label if _label is not None else ' '.join(key.split('::')), + color=_color, + *args, **kwargs + ) + + self._set_plot_args(ax=ax, plot_setting_params=plot_setting_params) diff --git a/examples/40_advanced/example_plot_over_time.py b/examples/40_advanced/example_plot_over_time.py new file mode 100644 index 000000000..9c103452e --- /dev/null +++ b/examples/40_advanced/example_plot_over_time.py @@ -0,0 +1,82 @@ +""" +============================== +Plot the Performance over Time +============================== + +Auto-Pytorch uses SMAC to fit individual machine learning algorithms +and then ensembles them together using `Ensemble Selection +`_. + +The following examples shows how to plot both the performance +of the individual models and their respective ensemble. + +Additionally, as we are compatible with matplotlib, +you can input any args or kwargs that are compatible with ax.plot. +In the case when you would like to create multipanel visualization, +please input plt.Axes obtained from matplotlib.pyplot.subplots. 
+ +""" +import warnings + +import numpy as np +import pandas as pd + +from sklearn import model_selection + +import matplotlib.pyplot as plt + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.utils.results_visualizer import PlotSettingParams + + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + + +############################################################################ +# Task Definition +# =============== +n_samples, dim = 100, 2 +X = np.random.random((n_samples, dim)) * 2 - 1 +y = ((X ** 2).sum(axis=-1) < 2 / np.pi).astype(np.int32) +print(y) + +X, y = pd.DataFrame(X), pd.DataFrame(y) +X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y) + +############################################################################ +# API Instantiation and Searching +# =============================== +api = TabularClassificationTask(seed=42) + +api.search(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, + optimize_metric='accuracy', total_walltime_limit=120, func_eval_time_limit_secs=10) + +############################################################################ +# Create Setting Parameters Object +# ================================ +metric_name = 'accuracy' + +params = PlotSettingParams( + xscale='log', + xlabel='Runtime', + ylabel='Accuracy', + title='Toy Example', + show=False # If you would like to show, make it True +) + +############################################################################ +# Plot with the Specified Setting Parameters +# ========================================== +_, ax = plt.subplots() + +api.plot_perf_over_time( + ax=ax, # You do not have to provide. + metric_name=metric_name, + plot_setting_params=params, + marker='*', + markersize=10 +) + +# plt.show() might cause issue depending on environments +plt.savefig('example_plot_over_time.png') diff --git a/test/test_api/test_results_manager.py b/test/test_api/test_results_manager.py deleted file mode 100644 index 4c6e7a7ae..000000000 --- a/test/test_api/test_results_manager.py +++ /dev/null @@ -1,232 +0,0 @@ -import json -import os -from test.test_api.utils import make_dict_run_history_data -from unittest.mock import MagicMock - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace - -import numpy as np - -import pytest - -from smac.runhistory.runhistory import RunHistory, StatusType - -from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.api.results_manager import ResultsManager, STATUS2MSG, SearchResults, cost2metric -from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss - - -def _check_status(status): - """ Based on runhistory_B.json """ - ans = [ - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.CRASHED], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.TIMEOUT], STATUS2MSG[StatusType.TIMEOUT], - ] - assert isinstance(status, list) - assert isinstance(status[0], str) - assert status == ans - - -def _check_costs(costs): - """ Based on runhistory_B.json """ - ans = [0.15204678362573099, 
0.4444444444444444, 0.5555555555555556, 0.29824561403508776, - 0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444, - 0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444, - 0.4444444444444444, 1.0, 1.0] - assert np.allclose(1 - np.array(costs), ans) - assert isinstance(costs, np.ndarray) - assert costs.dtype is np.dtype(np.float) - - -def _check_fit_times(fit_times): - """ Based on runhistory_B.json """ - ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178, - 9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434, - 15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015, - 22.011935234069824] - - assert np.allclose(fit_times, ans) - assert isinstance(fit_times, np.ndarray) - assert fit_times.dtype is np.dtype(np.float) - - -def _check_budgets(budgets): - """ Based on runhistory_B.json """ - ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664, - 16.666666666666664, 50.0, 50.0] - assert np.allclose(budgets, ans) - assert isinstance(budgets, list) - assert isinstance(budgets[0], float) - - -def _check_additional_infos(status_types, additional_infos): - for i, status in enumerate(status_types): - info = additional_infos[i] - if status in (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]): - metric_info = info.get('opt_loss', None) - assert metric_info is not None - elif info is not None: - metric_info = info.get('opt_loss', None) - assert metric_info is None - - -def _check_metric_dict(metric_dict, status_types): - assert isinstance(metric_dict['accuracy'], list) - assert metric_dict['accuracy'][0] > 0 - assert isinstance(metric_dict['balanced_accuracy'], list) - assert metric_dict['balanced_accuracy'][0] > 0 - - for key, vals in metric_dict.items(): - # ^ is a XOR operator - # True and False / False and True must be fulfilled - assert all([(s == STATUS2MSG[StatusType.SUCCESS]) ^ isnan - for s, isnan in zip(status_types, np.isnan(vals))]) - - -def test_extract_results_from_run_history(): - # test the raise error for the `status_msg is None` - run_history = RunHistory() - cs = ConfigurationSpace() - config = Configuration(cs, {}) - run_history.add( - config=config, - cost=0.0, - time=1.0, - status=StatusType.CAPPED, - ) - with pytest.raises(ValueError) as excinfo: - SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) - - assert excinfo._excinfo[0] == ValueError - - -def test_search_results_sprint_statistics(): - api = BaseTask() - for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: - with pytest.raises(RuntimeError) as excinfo: - getattr(api, method)() - - assert excinfo._excinfo[0] == RuntimeError - - run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), - '.tmp_api/runhistory_B.json'), - mode='r'))['data'] - api._results_manager.run_history = MagicMock() - api.run_history.empty = MagicMock(return_value=False) - - # The run_history has 16 runs + 1 run interruption ==> 16 runs - api.run_history.data = make_dict_run_history_data(run_history_data) - api._metric = accuracy - api.dataset_name = 'iris' - api._scoring_functions = [accuracy, balanced_accuracy] - api.search_space = MagicMock(spec=ConfigurationSpace) - 
search_results = api.get_search_results() - - _check_status(search_results.status_types) - _check_costs(search_results.opt_scores) - _check_fit_times(search_results.fit_times) - _check_budgets(search_results.budgets) - _check_metric_dict(search_results.metric_dict, search_results.status_types) - _check_additional_infos(status_types=search_results.status_types, - additional_infos=search_results.additional_infos) - - # config_ids can duplicate because of various budget size - config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13] - assert config_ids == search_results.config_ids - - # assert that contents of search_results are of expected types - assert isinstance(search_results.rank_test_scores, np.ndarray) - assert search_results.rank_test_scores.dtype is np.dtype(np.int) - assert isinstance(search_results.configs, list) - - n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1 - msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}", - f"\tOptimisation Metric: {api._metric.name}", - f"\tBest validation score: {max(search_results.opt_scores)}", - "\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", - f"\tNumber of crashed target algorithm runs: {n_crashed}", - f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", - f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] - - assert isinstance(api.sprint_statistics(), str) - assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) - - -@pytest.mark.parametrize('run_history', (None, RunHistory())) -def test_check_run_history(run_history): - manager = ResultsManager() - manager.run_history = run_history - - with pytest.raises(RuntimeError) as excinfo: - manager._check_run_history() - - assert excinfo._excinfo[0] == RuntimeError - - -T, NT = 'traditional', 'non-traditional' -SCORES = [0.1 * (i + 1) for i in range(10)] - - -@pytest.mark.parametrize('include_traditional', (True, False)) -@pytest.mark.parametrize('metric', (accuracy, log_loss)) -@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) -@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) -def test_get_incumbent_results(include_traditional, metric, origins, scores): - manager = ResultsManager() - cs = ConfigurationSpace() - cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) - - configs = [0.1 * (i + 1) for i in range(len(scores))] - if metric.name == "log_loss": - # This is to detect mis-computation in reversion - metric._optimum = 0.1 - - best_cost, best_idx = np.inf, -1 - for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): - config = Configuration(cs, {'a': a}) - - # conversion defined in: - # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * score - manager.run_history.add( - config=config, - cost=cost, - time=1.0, - status=StatusType.SUCCESS, - additional_info={'opt_loss': {metric.name: score}, - 'configuration_origin': origin} - ) - if cost > best_cost: - continue - - if include_traditional: - best_cost, best_idx = cost, idx - elif origin != T: - best_cost, best_idx = cost, idx - - incumbent_config, incumbent_results = manager.get_incumbent_results( - metric=metric, - include_traditional=include_traditional - ) - - assert isinstance(incumbent_config, Configuration) - assert isinstance(incumbent_results, dict) - best_score, best_a = scores[best_idx], configs[best_idx] - assert 
np.allclose( - [best_score, best_score, best_a], - [cost2metric(best_cost, metric), - incumbent_results['opt_loss'][metric.name], - incumbent_config['a']] - ) - - if not include_traditional: - assert incumbent_results['configuration_origin'] != T diff --git a/test/test_api/.tmp_api/runhistory_B.json b/test/test_utils/runhistory.json similarity index 100% rename from test/test_api/.tmp_api/runhistory_B.json rename to test/test_utils/runhistory.json diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py new file mode 100644 index 000000000..60ee11f42 --- /dev/null +++ b/test/test_utils/test_results_manager.py @@ -0,0 +1,484 @@ +import json +import os +from datetime import datetime +from test.test_api.utils import make_dict_run_history_data +from unittest.mock import MagicMock + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import numpy as np + +import pytest + +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue, StatusType + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss +from autoPyTorch.utils.results_manager import ( + EnsembleResults, + MetricResults, + ResultsManager, + SearchResults, + cost2metric, + get_start_time +) + + +T, NT = 'traditional', 'non-traditional' +SCORES = [0.1 * (i + 1) for i in range(10)] +END_TIMES = [8, 4, 3, 6, 0, 7, 1, 9, 2, 5] + + +def _check_status(status): + """ Based on runhistory.json """ + ans = [ + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.CRASHED, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.TIMEOUT, StatusType.TIMEOUT, + ] + assert isinstance(status, list) + assert isinstance(status[0], StatusType) + assert status == ans + + +def _check_costs(costs): + """ Based on runhistory.json """ + ans = [0.15204678362573099, 0.4444444444444444, 0.5555555555555556, 0.29824561403508776, + 0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444, + 0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444, + 0.4444444444444444, 1.0, 1.0] + assert np.allclose(1 - np.array(costs), ans) + assert isinstance(costs, np.ndarray) + assert costs.dtype is np.dtype(np.float) + + +def _check_end_times(end_times): + """ Based on runhistory.json """ + ans = [1637342642.7887495, 1637342647.2651122, 1637342675.2555833, 1637342681.334954, + 1637342693.2717755, 1637342704.341065, 1637342726.1866672, 1637342743.3274522, + 1637342749.9442234, 1637342762.5487585, 1637342779.192385, 1637342804.3368232, + 1637342820.8067145, 1637342846.0210106, 1637342897.1205413, 1637342928.7456856] + + assert np.allclose(end_times, ans) + assert isinstance(end_times, np.ndarray) + assert end_times.dtype is np.dtype(np.float) + + +def _check_fit_times(fit_times): + """ Based on runhistory.json """ + ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178, + 9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434, + 15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015, + 22.011935234069824] + + assert np.allclose(fit_times, ans) + assert isinstance(fit_times, np.ndarray) + assert fit_times.dtype is np.dtype(np.float) + 
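+# The `_check_*` helpers in this file compare against the values stored in
+# test/test_utils/runhistory.json, which this patch renames from the old
+# test/test_api/.tmp_api/runhistory_B.json fixture.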
+ +def _check_budgets(budgets): + """ Based on runhistory.json """ + ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, + 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, + 5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664, + 16.666666666666664, 50.0, 50.0] + assert np.allclose(budgets, ans) + assert isinstance(budgets, list) + assert isinstance(budgets[0], float) + + +def _check_additional_infos(status_types, additional_infos): + for i, status in enumerate(status_types): + info = additional_infos[i] + if status in (StatusType.SUCCESS, StatusType.DONOTADVANCE): + metric_info = info.get('opt_loss', None) + assert metric_info is not None + elif info is not None: + metric_info = info.get('opt_loss', None) + assert metric_info is None + + +def _check_metric_dict(metric_dict, status_types, worst_val): + assert isinstance(metric_dict['accuracy'], list) + assert metric_dict['accuracy'][0] > 0 + assert isinstance(metric_dict['balanced_accuracy'], list) + assert metric_dict['balanced_accuracy'][0] > 0 + + for key, vals in metric_dict.items(): + # ^ is a XOR operator + # True and False / False and True must be fulfilled + assert all([(s == StatusType.SUCCESS) ^ np.isclose([val], [worst_val]) + for s, val in zip(status_types, vals)]) + + +def _check_metric_results(scores, metric, run_history, ensemble_performance_history): + if metric.name == 'accuracy': # Check the case when ensemble does not have the metric name + dummy_history = [{'Timestamp': datetime(2000, 1, 1), 'train_log_loss': 1, 'test_log_loss': 1}] + mr = MetricResults(metric, run_history, dummy_history) + # ensemble_results should be None because ensemble evaluated log_loss + assert mr.ensemble_results.empty() + data = mr.get_ensemble_merged_data() + # since ensemble_results is None, merged_data must be identical to the run_history data + assert all(np.allclose(data[key], mr.data[key]) for key in data.keys()) + + mr = MetricResults(metric, run_history, ensemble_performance_history) + perfs = np.array([cost2metric(s, metric) for s in scores]) + modified_scores = scores[::2] + [0] + modified_scores.insert(2, 0) + ens_perfs = np.array([s for s in modified_scores]) + assert np.allclose(mr.data[f'single::train::{metric.name}'], perfs) + assert np.allclose(mr.data[f'single::opt::{metric.name}'], perfs) + assert np.allclose(mr.data[f'single::test::{metric.name}'], perfs) + assert np.allclose(mr.data[f'ensemble::train::{metric.name}'], ens_perfs) + assert np.allclose(mr.data[f'ensemble::test::{metric.name}'], ens_perfs) + + # the end times of synthetic ensemble is [0.25, 0.45, 0.45, 0.65, 0.85, 0.85] + # the end times of synthetic run history is 0.1 * np.arange(1, 9) or 0.1 * np.arange(2, 10) + ensemble_ends_later = mr.search_results.end_times[-1] < mr.ensemble_results.end_times[-1] + indices = [2, 4, 4, 6, 8, 8] if ensemble_ends_later else [1, 3, 3, 5, 7, 7] + + merged_data = mr.get_ensemble_merged_data() + worst_val = metric._worst_possible_result + minimize = metric._sign == -1 + ans = np.full_like(mr.cum_times, worst_val) + for idx, s in zip(indices, mr.ensemble_results.train_scores): + ans[idx] = min(ans[idx], s) if minimize else max(ans[idx], s) + + assert np.allclose(ans, merged_data[f'ensemble::train::{metric.name}']) + assert np.allclose(ans, merged_data[f'ensemble::test::{metric.name}']) + + +def test_extract_results_from_run_history(): + # test the raise error for the `status_msg is None` + run_history = RunHistory() + cs = ConfigurationSpace() 
+ config = Configuration(cs, {}) + run_history.add( + config=config, + cost=0.0, + time=1.0, + status=StatusType.CAPPED, + ) + with pytest.raises(ValueError) as excinfo: + SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) + + assert excinfo._excinfo[0] == ValueError + + +def test_raise_error_in_update_and_sort_by_time(): + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + config = Configuration(cs, {'a': 0.1}) + + sr = SearchResults(metric=accuracy, scoring_functions=[], run_history=RunHistory()) + er = EnsembleResults(metric=accuracy, ensemble_performance_history=[]) + + with pytest.raises(RuntimeError) as excinfo: + sr._update( + config=config, + run_key=RunKey(config_id=0, instance_id=0, seed=0), + run_value=RunValue( + cost=0, time=1, status=StatusType.SUCCESS, + starttime=0, endtime=1, additional_info={} + ) + ) + + assert excinfo._excinfo[0] == RuntimeError + + with pytest.raises(RuntimeError) as excinfo: + sr._sort_by_endtime() + + assert excinfo._excinfo[0] == RuntimeError + + with pytest.raises(RuntimeError) as excinfo: + er._update(data={}) + + assert excinfo._excinfo[0] == RuntimeError + + with pytest.raises(RuntimeError) as excinfo: + er._sort_by_endtime() + + +@pytest.mark.parametrize('starttimes', (list(range(10)), list(range(10))[::-1])) +@pytest.mark.parametrize('status_types', ( + [StatusType.SUCCESS] * 9 + [StatusType.STOP], + [StatusType.RUNNING] + [StatusType.SUCCESS] * 9 +)) +def test_get_start_time(starttimes, status_types): + run_history = RunHistory() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + endtime = 1e9 + kwargs = dict(cost=1.0, endtime=endtime) + for starttime, status_type in zip(starttimes, status_types): + config = Configuration(cs, {'a': 0.1 * starttime}) + run_history.add( + config=config, + starttime=starttime, + time=endtime - starttime, + status=status_type, + **kwargs + ) + starttime = get_start_time(run_history) + + # this rule is strictly defined on the inputs defined from pytest + ans = min(t for s, t in zip(status_types, starttimes) if s == StatusType.SUCCESS) + assert starttime == ans + + +def test_raise_error_in_get_start_time(): + # test the raise error for the `status_msg is None` + run_history = RunHistory() + cs = ConfigurationSpace() + config = Configuration(cs, {}) + run_history.add( + config=config, + cost=0.0, + time=1.0, + status=StatusType.CAPPED, + ) + + with pytest.raises(ValueError) as excinfo: + get_start_time(run_history) + + assert excinfo._excinfo[0] == ValueError + + +def test_search_results_sort_by_endtime(): + run_history = RunHistory() + n_configs = len(SCORES) + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + order = np.argsort(END_TIMES) + ans = np.array(SCORES)[order].tolist() + status_types = [StatusType.SUCCESS, StatusType.DONOTADVANCE] * (n_configs // 2) + + for i, (fixed_val, et, status) in enumerate(zip(SCORES, END_TIMES, status_types)): + config = Configuration(cs, {'a': fixed_val}) + run_history.add( + config=config, cost=fixed_val, + status=status, budget=fixed_val, + time=et - fixed_val, starttime=fixed_val, endtime=et, + additional_info={ + 'a': fixed_val, + 'configuration_origin': [T, NT][i % 2], + 'train_loss': {accuracy.name: fixed_val - 0.1}, + 'opt_loss': {accuracy.name: fixed_val}, + 'test_loss': {accuracy.name: fixed_val + 0.1} + } + ) + + sr = SearchResults(accuracy, scoring_functions=[], 
run_history=run_history, order_by_endtime=True) + assert sr.budgets == ans + assert np.allclose(accuracy._optimum - accuracy._sign * sr.opt_scores, ans) + assert np.allclose(accuracy._optimum - accuracy._sign * sr.train_scores, np.array(ans) - accuracy._sign * 0.1) + assert np.allclose(accuracy._optimum - accuracy._sign * sr.test_scores, np.array(ans) + accuracy._sign * 0.1) + assert np.allclose(1 - sr.opt_scores, ans) + assert sr._end_times == list(range(n_configs)) + assert all(c.get('a') == val for val, c in zip(ans, sr.configs)) + assert all(info['a'] == val for val, info in zip(ans, sr.additional_infos)) + assert np.all(np.array([s for s in status_types])[order] == np.array(sr.status_types)) + assert sr.is_traditionals == np.array([True, False] * 5)[order].tolist() + assert np.allclose(sr.fit_times, np.subtract(np.arange(n_configs), ans)) + + +def test_ensemble_results(): + order = np.argsort(END_TIMES) + end_times = [datetime.timestamp(datetime(2000, et + 1, 1)) for et in END_TIMES] + ensemble_performance_history = [ + {'Timestamp': datetime(2000, et + 1, 1), 'train_accuracy': s1, 'test_accuracy': s2} + for et, s1, s2 in zip(END_TIMES, SCORES, SCORES[::-1]) + ] + + er = EnsembleResults(log_loss, ensemble_performance_history) + assert er.empty() + + er = EnsembleResults(accuracy, ensemble_performance_history) + assert er._train_scores == SCORES + assert np.allclose(er.train_scores, SCORES) + assert er._test_scores == SCORES[::-1] + assert np.allclose(er.test_scores, SCORES[::-1]) + assert np.allclose(er.end_times, end_times) + + er = EnsembleResults(accuracy, ensemble_performance_history, order_by_endtime=True) + assert np.allclose(er.train_scores, np.array(SCORES)[order]) + assert np.allclose(er.test_scores, np.array(SCORES[::-1])[order]) + assert np.allclose(er.end_times, np.array(end_times)[order]) + + +@pytest.mark.parametrize('metric', (accuracy, log_loss)) +@pytest.mark.parametrize('scores', (SCORES[:8], SCORES[:8][::-1])) +@pytest.mark.parametrize('ensemble_ends_later', (True, False)) +def test_metric_results(metric, scores, ensemble_ends_later): + # since datetime --> timestamp variates between machines and float64 might not + # be able to handle time precisely enough, we might need to change t0 in the future. 
+ # Basically, it happens because this test is checking by the precision of milli second + t0, ms_unit = (1970, 1, 1, 9, 0, 0), 100000 + ensemble_performance_history = [ + {'Timestamp': datetime(*t0, ms_unit * 2 * (i + 1) + ms_unit // 2), + f'train_{metric.name}': s, + f'test_{metric.name}': s} + for i, s in enumerate(scores[::2]) + ] + # Add a record with the exact same stamp as the last one + ensemble_performance_history.append( + {'Timestamp': datetime(*t0, ms_unit * 8 + ms_unit // 2), + f'train_{metric.name}': 0, + f'test_{metric.name}': 0} + ) + # Add a record with the exact same stamp as a middle one + ensemble_performance_history.append( + {'Timestamp': datetime(*t0, ms_unit * 4 + ms_unit // 2), + f'train_{metric.name}': 0, + f'test_{metric.name}': 0} + ) + + run_history = RunHistory() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + + for i, fixed_val in enumerate(scores): + config = Configuration(cs, {'a': fixed_val}) + st = datetime.timestamp(datetime(*t0, ms_unit * (i + 1 - ensemble_ends_later))) + et = datetime.timestamp(datetime(*t0, ms_unit * (i + 2 - ensemble_ends_later))) + run_history.add( + config=config, cost=1, budget=0, + time=0.1, starttime=st, endtime=et, + status=StatusType.SUCCESS, + additional_info={ + 'configuration_origin': T, + 'train_loss': {f'{metric.name}': fixed_val}, + 'opt_loss': {f'{metric.name}': fixed_val}, + 'test_loss': {f'{metric.name}': fixed_val} + } + ) + _check_metric_results(scores, metric, run_history, ensemble_performance_history) + + +def test_search_results_sprint_statistics(): + api = BaseTask() + for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: + with pytest.raises(RuntimeError) as excinfo: + getattr(api, method)() + + assert excinfo._excinfo[0] == RuntimeError + + run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), + 'runhistory.json'), + mode='r'))['data'] + api._results_manager.run_history = MagicMock() + api.run_history.empty = MagicMock(return_value=False) + + # The run_history has 16 runs + 1 run interruption ==> 16 runs + api.run_history.data = make_dict_run_history_data(run_history_data) + api._metric = accuracy + api.dataset_name = 'iris' + api._scoring_functions = [accuracy, balanced_accuracy] + api.search_space = MagicMock(spec=ConfigurationSpace) + worst_val = api._metric._worst_possible_result + search_results = api.get_search_results() + + _check_status(search_results.status_types) + _check_costs(search_results.opt_scores) + _check_end_times(search_results.end_times) + _check_fit_times(search_results.fit_times) + _check_budgets(search_results.budgets) + _check_metric_dict(search_results.opt_metric_dict, search_results.status_types, worst_val) + _check_additional_infos(status_types=search_results.status_types, + additional_infos=search_results.additional_infos) + + # config_ids can duplicate because of various budget size + config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13] + assert config_ids == search_results.config_ids + + # assert that contents of search_results are of expected types + assert isinstance(search_results.rank_opt_scores, np.ndarray) + assert search_results.rank_opt_scores.dtype is np.dtype(np.int) + assert isinstance(search_results.configs, list) + + n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1 + msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}", + f"\tOptimisation Metric: {api._metric.name}", + f"\tBest validation score: 
{max(search_results.opt_scores)}", + "\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", + f"\tNumber of crashed target algorithm runs: {n_crashed}", + f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", + f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] + + assert isinstance(api.sprint_statistics(), str) + assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) + + +@pytest.mark.parametrize('run_history', (None, RunHistory())) +def test_check_run_history(run_history): + manager = ResultsManager() + manager.run_history = run_history + + with pytest.raises(RuntimeError) as excinfo: + manager._check_run_history() + + assert excinfo._excinfo[0] == RuntimeError + + +@pytest.mark.parametrize('include_traditional', (True, False)) +@pytest.mark.parametrize('metric', (accuracy, log_loss)) +@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) +@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) +def test_get_incumbent_results(include_traditional, metric, origins, scores): + manager = ResultsManager() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + + configs = [0.1 * (i + 1) for i in range(len(scores))] + if metric.name == "log_loss": + # This is to detect mis-computation in reversion + metric._optimum = 0.1 + + best_cost, best_idx = np.inf, -1 + for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): + config = Configuration(cs, {'a': a}) + + # conversion defined in: + # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss + cost = metric._optimum - metric._sign * score + manager.run_history.add( + config=config, + cost=cost, + time=1.0, + status=StatusType.SUCCESS, + additional_info={'train_loss': {metric.name: cost}, + 'opt_loss': {metric.name: cost}, + 'test_loss': {metric.name: cost}, + 'configuration_origin': origin} + ) + if cost > best_cost: + continue + + if include_traditional: + best_cost, best_idx = cost, idx + elif origin != T: + best_cost, best_idx = cost, idx + + incumbent_config, incumbent_results = manager.get_incumbent_results( + metric=metric, + include_traditional=include_traditional + ) + + assert isinstance(incumbent_config, Configuration) + assert isinstance(incumbent_results, dict) + best_score, best_a = scores[best_idx], configs[best_idx] + assert np.allclose( + [best_score, best_score, best_a], + [cost2metric(best_cost, metric), + cost2metric(incumbent_results['opt_loss'][metric.name], metric), + incumbent_config['a']] + ) + + if not include_traditional: + assert incumbent_results['configuration_origin'] != T diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py new file mode 100644 index 000000000..926d21e6f --- /dev/null +++ b/test/test_utils/test_results_visualizer.py @@ -0,0 +1,274 @@ +import json +import os +from datetime import datetime +from test.test_api.utils import make_dict_run_history_data +from unittest.mock import MagicMock + +from ConfigSpace import ConfigurationSpace + +import matplotlib.pyplot as plt + +import numpy as np + +import pytest + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.metrics import accuracy, balanced_accuracy +from autoPyTorch.utils.results_visualizer import ( + ColorLabelSettings, + PlotSettingParams, + ResultsVisualizer, + _get_perf_and_time +) + + +TEST_CL = ('test color', 'test label') + + 
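+# TEST_CL is the (color, label) pair that `ColorLabelSettings.extract_dicts` is
+# expected to forward unchanged for the `single::opt` entry in the tests below.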
+@pytest.mark.parametrize('cl_settings', ( + ColorLabelSettings(single_opt=TEST_CL), + ColorLabelSettings(single_opt=TEST_CL, single_test=None, single_train=None) +)) +@pytest.mark.parametrize('with_ensemble', (True, False)) +def test_extract_dicts(cl_settings, with_ensemble): + dummy_keys = [name for name in [ + 'single::train::dummy', + 'single::opt::dummy', + 'single::test::dummy', + 'ensemble::train::dummy', + 'ensemble::test::dummy' + ] if ( + (with_ensemble or not name.startswith('ensemble')) + and getattr(cl_settings, "_".join(name.split('::')[:2])) is not None + ) + ] + + results = MagicMock() + results.data.keys = MagicMock(return_value=dummy_keys) + cd, ld = cl_settings.extract_dicts(results) + assert set(dummy_keys) == set(cd.keys()) + assert set(dummy_keys) == set(ld.keys()) + + opt_key = 'single::opt::dummy' + assert TEST_CL == (cd[opt_key], ld[opt_key]) + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(show=True), + PlotSettingParams(show=False) +)) +def test_plt_show_in_set_plot_args(params): # TODO + plt.show = MagicMock() + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + assert plt.show._mock_called == params.show + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log'), + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log') +)) +def test_raise_value_error_in_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + with pytest.raises(ValueError) as excinfo: + viz._set_plot_args(ax, params) + + assert excinfo._excinfo[0] == ValueError + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xlim=(-100, 100), ylim=(-200, 200)), + PlotSettingParams(xlabel='x label', ylabel='y label'), + PlotSettingParams(xscale='log', yscale='log'), + PlotSettingParams(legend=False, title='Title') +)) +def test_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + viz._set_plot_args(ax, params) + + if params.xlim is not None: + assert ax.get_xlim() == params.xlim + if params.ylim is not None: + assert ax.get_ylim() == params.ylim + + assert ax.xaxis.get_label()._text == ('' if params.xlabel is None else params.xlabel) + assert ax.yaxis.get_label()._text == ('' if params.ylabel is None else params.ylabel) + assert ax.get_title() == ('' if params.title is None else params.title) + assert params.xscale == ax.get_xscale() + assert params.yscale == ax.get_yscale() + + if params.legend: + assert ax.get_legend() is not None + else: + assert ax.get_legend() is None + + plt.close() + + +@pytest.mark.parametrize('metric_name', ('unknown', 'accuracy')) +def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): + api = BaseTask() + + if metric_name == 'unknown': + with pytest.raises(ValueError) as excinfo: + api.plot_perf_over_time(metric_name) + assert excinfo._excinfo[0] == ValueError + else: + with pytest.raises(RuntimeError) as excinfo: + api.plot_perf_over_time(metric_name) + assert excinfo._excinfo[0] == RuntimeError + + +@pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy')) +def test_plot_perf_over_time(metric_name): # TODO + dummy_history = [{'Timestamp': datetime(2022, 1, 1), 'train_accuracy': 1, 'test_accuracy': 1}] + api = BaseTask() + run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), + 'runhistory.json'), + 
mode='r'))['data'] + api._results_manager.run_history = MagicMock() + api.run_history.empty = MagicMock(return_value=False) + + # The run_history has 16 runs + 1 run interruption ==> 16 runs + api.run_history.data = make_dict_run_history_data(run_history_data) + api._results_manager.ensemble_performance_history = dummy_history + api._metric = accuracy + api.dataset_name = 'iris' + api._scoring_functions = [accuracy, balanced_accuracy] + api.search_space = MagicMock(spec=ConfigurationSpace) + + api.plot_perf_over_time(metric_name=metric_name) + _, ax = plt.subplots(nrows=1, ncols=1) + api.plot_perf_over_time(metric_name=metric_name, ax=ax) + + # remove ensemble keys if metric name is not for the opt score + ans = set([ + name + for name in [f'single train {metric_name}', + f'single test {metric_name}', + f'single opt {metric_name}', + f'ensemble train {metric_name}', + f'ensemble test {metric_name}'] + if metric_name == api._metric.name or not name.startswith('ensemble') + ]) + legend_set = set([txt._text for txt in ax.get_legend().texts]) + assert ans == legend_set + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log'), + PlotSettingParams(xscale='log', yscale='none'), + PlotSettingParams(yscale='log') +)) +def test_raise_error_get_perf_and_time(params): + results = np.linspace(-1, 1, 10) + cum_times = np.linspace(0, 1, 10) + + with pytest.raises(ValueError) as excinfo: + _get_perf_and_time( + cum_results=results, + cum_times=cum_times, + plot_setting_params=params, + worst_val=np.inf + ) + + assert excinfo._excinfo[0] == ValueError + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(n_points=20, xscale='linear', yscale='linear'), + PlotSettingParams(n_points=20, xscale='log', yscale='log') +)) +def test_get_perf_and_time(params): + y_min, y_max = 1e-5, 1 + results = np.linspace(y_min, y_max, 10) + cum_times = np.linspace(y_min, y_max, 10) + + check_points, perf_by_time_step = _get_perf_and_time( + cum_results=results, + cum_times=cum_times, + plot_setting_params=params, + worst_val=np.inf + ) + + times_ans = np.linspace( + y_min if params.xscale == 'linear' else np.log(y_min), + y_max if params.xscale == 'linear' else np.log(y_max), + params.n_points + ) + times_ans = times_ans if params.xscale == 'linear' else np.exp(times_ans) + assert np.allclose(check_points, times_ans) + + if params.xscale == 'linear': + """ + each time step to record the result + [1.00000000e-05, 5.26410526e-02, 1.05272105e-01, 1.57903158e-01, + 2.10534211e-01, 2.63165263e-01, 3.15796316e-01, 3.68427368e-01, + 4.21058421e-01, 4.73689474e-01, 5.26320526e-01, 5.78951579e-01, + 6.31582632e-01, 6.84213684e-01, 7.36844737e-01, 7.89475789e-01, + 8.42106842e-01, 8.94737895e-01, 9.47368947e-01, 1.00000000e+00] + + The time steps when each result was recorded + [ + 1.0000e-05, # cover index 0 ~ 2 + 1.1112e-01, # cover index 3, 4 + 2.2223e-01, # cover index 5, 6 + 3.3334e-01, # cover index 7, 8 + 4.4445e-01, # cover index 9, 10 + 5.5556e-01, # cover index 11, 12 + 6.6667e-01, # cover index 13, 14 + 7.7778e-01, # cover index 15, 16 + 8.8889e-01, # cover index 17, 18 + 1.0000e+00 # cover index 19 + ] + Since the sequence is monotonically increasing, + if multiple elements cover the same index, take the best. 
+ """ + results_ans = [r for r in results] + results_ans = [results[0]] + results_ans + results_ans[:-1] + results_ans = np.sort(results_ans) + else: + """ + each time step to record the result + [1.00000000e-05, 1.83298071e-05, 3.35981829e-05, 6.15848211e-05, + 1.12883789e-04, 2.06913808e-04, 3.79269019e-04, 6.95192796e-04, + 1.27427499e-03, 2.33572147e-03, 4.28133240e-03, 7.84759970e-03, + 1.43844989e-02, 2.63665090e-02, 4.83293024e-02, 8.85866790e-02, + 1.62377674e-01, 2.97635144e-01, 5.45559478e-01, 1.00000000e+00] + + The time steps when each result was recorded + [ + 1.0000e-05, # cover index 0 ~ 15 + 1.1112e-01, # cover index 16 + 2.2223e-01, # cover index 17 + 3.3334e-01, # cover index 18 + 4.4445e-01, # cover index 18 + 5.5556e-01, # cover index 19 + 6.6667e-01, # cover index 19 + 7.7778e-01, # cover index 19 + 8.8889e-01, # cover index 19 + 1.0000e+00 # cover index 19 + ] + Since the sequence is monotonically increasing, + if multiple elements cover the same index, take the best. + """ + results_ans = [ + *([results[0]] * 16), + results[1], + results[2], + results[4], + results[-1] + ] + + assert np.allclose(perf_by_time_step, results_ans) From 8f9e9f6b8d25921ef8f21331cfc1b7c9c260f312 Mon Sep 17 00:00:00 2001 From: Eddie Bergman Date: Wed, 1 Dec 2021 17:50:56 +0100 Subject: [PATCH 6/7] Cleanup of simple_imputer (#346) * cleanup of simple_imputer * Fixed doc and typo * Fixed docs * Made changes, added test * Fixed init statement * Fixed docs * Flake'd --- .../imputation/SimpleImputer.py | 161 ++++++++++++------ .../components/preprocessing/test_imputers.py | 12 ++ 2 files changed, 117 insertions(+), 56 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index ea09798ce..3d7ca22b1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -1,9 +1,7 @@ from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) +from ConfigSpace.hyperparameters import CategoricalHyperparameter import numpy as np @@ -15,92 +13,143 @@ class SimpleImputer(BaseImputer): - """ - Impute missing values for categorical columns with '!missing!' - (In case of numpy data, the constant value is set to -1, under - the assumption that categorical data is fit with an Ordinal Scaler) + """An imputer for categorical and numerical columns + + Impute missing values for categorical columns with 'constant_!missing!' + + Note: + In case of numpy data, the constant value is set to -1, under the assumption + that categorical data is fit with an Ordinal Scaler. + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['most_frequent', 'constant_!missing!'] + categorical_strategy (str: default='most_frequent') + The strategy to use for imputing categorical columns. 
+ Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] """ - def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None, - numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent'): + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + numerical_strategy: str = 'mean', + categorical_strategy: str = 'most_frequent' + ): + """ + Note: + 'constant' as numerical_strategy uses 0 as the default fill_value while + 'constant_!missing!' uses a fill_value of -1. + This behaviour should probably be fixed. + """ super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy self.categorical_strategy = categorical_strategy - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: - """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: + """ Fits the underlying model and returns the transformed array. + Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels + X (np.ndarray): + The input features to fit on + y (Optional[np.ndarray]): + The labels for the input features `X` Returns: - instance of self + SimpleImputer: + returns self """ self.check_requirements(X, y) - categorical_columns = X['dataset_properties']['categorical_columns'] \ - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] - if len(categorical_columns) != 0: + + # Choose an imputer for any categorical columns + categorical_columns = X['dataset_properties']['categorical_columns'] + + if isinstance(categorical_columns, List) and len(categorical_columns) != 0: if self.categorical_strategy == 'constant_!missing!': - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False) + # Train data is numpy as of this point, where an Ordinal Encoding is used + # for categoricals. 
Only Numbers are allowed for `fill_value` + imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) + self.preprocessor['categorical'] = imputer else: - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, - copy=False) - numerical_columns = X['dataset_properties']['numerical_columns'] \ - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] - if len(numerical_columns) != 0: + imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) + self.preprocessor['categorical'] = imputer + + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.numerical_strategy == 'constant_zero': - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', - fill_value=0, - copy=False) + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) + self.preprocessor['numerical'] = imputer else: - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + self.preprocessor['numerical'] = imputer return self @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', - value_range=("mean", "median", - "most_frequent", - "constant_zero"), - default_value="mean", - ), + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='numerical_strategy', + value_range=("mean", "median", "most_frequent", "constant_zero"), + default_value="mean", + ), categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='categorical_strategy', - value_range=("most_frequent", - "constant_!missing!"), - default_value="most_frequent") + value_range=("most_frequent", "constant_!missing!"), + default_value="most_frequent" + ) ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for numerical imputation + caterogical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for categorical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ cs = ConfigurationSpace() - assert dataset_properties is not None, "To create hyperparameter search space" \ - ", dataset_properties should not be None" - if len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: + + if dataset_properties is None: + raise ValueError("SimpleImputer requires `dataset_properties` for generating" + " a search space.") + + if ( + isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if len(dataset_properties['categorical_columns']) \ - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: + if ( + isinstance(dataset_properties['categorical_columns'], List) + and len(dataset_properties['categorical_columns']) + ): add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) return cs @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + """Get the properties of the SimpleImputer class and what it can handle + + Returns: + Dict[str, Union[str, bool]]: + A dict from property names to values + """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 983737dfe..18b43bfa6 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -3,6 +3,8 @@ import numpy as np from numpy.testing import assert_array_equal +import pytest + from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer @@ -213,6 +215,16 @@ def test_constant_imputation(self): [7.0, '0', 9], [4.0, '0', '0']], dtype=str)) + def test_imputation_without_dataset_properties_raises_error(self): + """Tests SimpleImputer checks for dataset properties when querying for + HyperparameterSearchSpace, even though the arg is marked `Optional`. + + Expects: + * Should raise a ValueError that no dataset_properties were passed + """ + with pytest.raises(ValueError): + SimpleImputer.get_hyperparameter_search_space() + if __name__ == '__main__': unittest.main() From 40a398743a4bbd6b186844fe46ce71101580b5f1 Mon Sep 17 00:00:00 2001 From: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> Date: Mon, 6 Dec 2021 14:04:31 +0100 Subject: [PATCH 7/7] [feat] Add the option to save a figure in plot setting params (#351) * [feat] Add the option to save a figure in plot setting params Since non-GUI based environments would like to avoid the usage of show method in the matplotlib, I added the option to savefig and thus users can complete the operations inside AutoPytorch. * [doc] Add a comment for non-GUI based computer in plot_perf_over_time method * [test] Add a test to check the priority of show and savefig Since plt.savefig and plt.show do not work at the same time due to the matplotlib design, we need to check whether show will not be called when a figname is specified. 
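For illustration, the intended behaviour is roughly the following (the metric name and
file name here are arbitrary placeholders, not values used by the tests):

    params = PlotSettingParams(show=True, figname='result.png')
    api.plot_perf_over_time(metric_name='accuracy', plot_setting_params=params)
    # The figure is written via plt.savefig('result.png', ...); plt.show() is not
    # called, because figname takes priority over show.
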
We can actually raise an error, but plot will be basically called in the end of an optimization, so I wanted to avoid raising an error and just sticked to a check by tests. --- autoPyTorch/api/base_task.py | 3 ++ autoPyTorch/utils/results_visualizer.py | 48 ++++++++++++++----- .../40_advanced/example_plot_over_time.py | 11 ++--- test/test_utils/test_results_manager.py | 30 ++++-------- test/test_utils/test_results_visualizer.py | 48 ++++++++++++++----- 5 files changed, 89 insertions(+), 51 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index edd505d86..b4d20165e 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1513,6 +1513,9 @@ def plot_perf_over_time( The settings of a pair of color and label for each plot. args, kwargs (Any): Arguments for the ax.plot. + + Note: + You might need to run `export DISPLAY=:0.0` if you are using non-GUI based environment. """ if not hasattr(metrics, metric_name): diff --git a/autoPyTorch/utils/results_visualizer.py b/autoPyTorch/utils/results_visualizer.py index 64c87ba94..e1debe29c 100644 --- a/autoPyTorch/utils/results_visualizer.py +++ b/autoPyTorch/utils/results_visualizer.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, NamedTuple, Optional, Tuple import matplotlib.pyplot as plt @@ -71,8 +71,7 @@ def extract_dicts( return colors, labels -@dataclass(frozen=True) -class PlotSettingParams: +class PlotSettingParams(NamedTuple): """ Parameters for the plot environment. @@ -93,12 +92,28 @@ class PlotSettingParams: The range of x axis. ylim (Tuple[float, float]): The range of y axis. + grid (bool): + Whether to have grid lines. + If users would like to define lines in detail, + they need to deactivate it. legend (bool): Whether to have legend in the figure. - legend_loc (str): - The location of the legend. + legend_kwargs (Dict[str, Any]): + The kwargs for ax.legend. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.legend.html + title (Optional[str]): + The title of the figure. + title_kwargs (Dict[str, Any]): + The kwargs for ax.set_title except title label. + Ref: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.axes.Axes.set_title.html show (bool): Whether to show the plot. + If figname is not None, the save will be prioritized. + figname (Optional[str]): + Name of a figure to save. If None, no figure will be saved. + savefig_kwargs (Dict[str, Any]): + The kwargs for plt.savefig except filename. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html args, kwargs (Any): Arguments for the ax.plot. 
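+
+    Example:
+        Illustrative values only; any labels or file name can be used.
+
+        params = PlotSettingParams(
+            xlabel='Runtime',
+            ylabel='Accuracy',
+            figname='perf_over_time.png',
+            savefig_kwargs={'bbox_inches': 'tight'},
+            show=False,  # ignored when figname is given; saving takes priority
+        )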
""" @@ -108,12 +123,16 @@ class PlotSettingParams: xlabel: Optional[str] = None ylabel: Optional[str] = None title: Optional[str] = None + title_kwargs: Dict[str, Any] = {} xlim: Optional[Tuple[float, float]] = None ylim: Optional[Tuple[float, float]] = None + grid: bool = True legend: bool = True - legend_loc: str = 'best' + legend_kwargs: Dict[str, Any] = {} show: bool = False + figname: Optional[str] = None figsize: Optional[Tuple[int, int]] = None + savefig_kwargs: Dict[str, Any] = {} class ScaleChoices(Enum): @@ -201,17 +220,22 @@ def _set_plot_args( ax.set_xscale(plot_setting_params.xscale) ax.set_yscale(plot_setting_params.yscale) - if plot_setting_params.xscale == 'log' or plot_setting_params.yscale == 'log': - ax.grid(True, which='minor', color='gray', linestyle=':') - ax.grid(True, which='major', color='black') + if plot_setting_params.grid: + if plot_setting_params.xscale == 'log' or plot_setting_params.yscale == 'log': + ax.grid(True, which='minor', color='gray', linestyle=':') + + ax.grid(True, which='major', color='black') if plot_setting_params.legend: - ax.legend(loc=plot_setting_params.legend_loc) + ax.legend(**plot_setting_params.legend_kwargs) if plot_setting_params.title is not None: - ax.set_title(plot_setting_params.title) - if plot_setting_params.show: + ax.set_title(plot_setting_params.title, **plot_setting_params.title_kwargs) + + if plot_setting_params.figname is not None: + plt.savefig(plot_setting_params.figname, **plot_setting_params.savefig_kwargs) + elif plot_setting_params.show: plt.show() @staticmethod diff --git a/examples/40_advanced/example_plot_over_time.py b/examples/40_advanced/example_plot_over_time.py index 9c103452e..cf672fc46 100644 --- a/examples/40_advanced/example_plot_over_time.py +++ b/examples/40_advanced/example_plot_over_time.py @@ -62,21 +62,20 @@ xlabel='Runtime', ylabel='Accuracy', title='Toy Example', - show=False # If you would like to show, make it True + figname='example_plot_over_time.png', + savefig_kwargs={'bbox_inches': 'tight'}, + show=False # If you would like to show, make it True and set figname=None ) ############################################################################ # Plot with the Specified Setting Parameters # ========================================== -_, ax = plt.subplots() +# _, ax = plt.subplots() <=== You can feed it to post-process the figure. +# You might need to run `export DISPLAY=:0.0` if you are using non-GUI based environment. api.plot_perf_over_time( - ax=ax, # You do not have to provide. 
metric_name=metric_name, plot_setting_params=params, marker='*', markersize=10 ) - -# plt.show() might cause issue depending on environments -plt.savefig('example_plot_over_time.png') diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py index 60ee11f42..8998009a4 100644 --- a/test/test_utils/test_results_manager.py +++ b/test/test_utils/test_results_manager.py @@ -165,11 +165,9 @@ def test_extract_results_from_run_history(): time=1.0, status=StatusType.CAPPED, ) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError): SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) - assert excinfo._excinfo[0] == ValueError - def test_raise_error_in_update_and_sort_by_time(): cs = ConfigurationSpace() @@ -179,7 +177,7 @@ def test_raise_error_in_update_and_sort_by_time(): sr = SearchResults(metric=accuracy, scoring_functions=[], run_history=RunHistory()) er = EnsembleResults(metric=accuracy, ensemble_performance_history=[]) - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): sr._update( config=config, run_key=RunKey(config_id=0, instance_id=0, seed=0), @@ -189,19 +187,13 @@ def test_raise_error_in_update_and_sort_by_time(): ) ) - assert excinfo._excinfo[0] == RuntimeError - - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): sr._sort_by_endtime() - assert excinfo._excinfo[0] == RuntimeError - - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): er._update(data={}) - assert excinfo._excinfo[0] == RuntimeError - - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): er._sort_by_endtime() @@ -244,11 +236,9 @@ def test_raise_error_in_get_start_time(): status=StatusType.CAPPED, ) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError): get_start_time(run_history) - assert excinfo._excinfo[0] == ValueError - def test_search_results_sort_by_endtime(): run_history = RunHistory() @@ -364,11 +354,9 @@ def test_metric_results(metric, scores, ensemble_ends_later): def test_search_results_sprint_statistics(): api = BaseTask() for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): getattr(api, method)() - assert excinfo._excinfo[0] == RuntimeError - run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), 'runhistory.json'), mode='r'))['data'] @@ -420,11 +408,9 @@ def test_check_run_history(run_history): manager = ResultsManager() manager.run_history = run_history - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): manager._check_run_history() - assert excinfo._excinfo[0] == RuntimeError - @pytest.mark.parametrize('include_traditional', (True, False)) @pytest.mark.parametrize('metric', (accuracy, log_loss)) diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py index 926d21e6f..c463fa063 100644 --- a/test/test_utils/test_results_visualizer.py +++ b/test/test_utils/test_results_visualizer.py @@ -55,15 +55,46 @@ def test_extract_dicts(cl_settings, with_ensemble): @pytest.mark.parametrize('params', ( PlotSettingParams(show=True), - PlotSettingParams(show=False) + PlotSettingParams(show=False), + PlotSettingParams(show=True, figname='dummy') )) def test_plt_show_in_set_plot_args(params): # TODO plt.show = MagicMock() + plt.savefig = MagicMock() _, ax = plt.subplots(nrows=1, ncols=1) viz = 
ResultsVisualizer() viz._set_plot_args(ax, params) - assert plt.show._mock_called == params.show + # if figname is not None, show will not be called. (due to the matplotlib design) + assert plt.show._mock_called == (params.figname is None and params.show) + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(), + PlotSettingParams(figname='fig') +)) +def test_plt_savefig_in_set_plot_args(params): # TODO + plt.savefig = MagicMock() + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + assert plt.savefig._mock_called == (params.figname is not None) + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(grid=True), + PlotSettingParams(grid=False) +)) +def test_ax_grid_in_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + ax.grid = MagicMock() + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + assert ax.grid._mock_called == params.grid plt.close() @@ -77,10 +108,9 @@ def test_raise_value_error_in_set_plot_args(params): # TODO _, ax = plt.subplots(nrows=1, ncols=1) viz = ResultsVisualizer() - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError): viz._set_plot_args(ax, params) - assert excinfo._excinfo[0] == ValueError plt.close() @@ -119,13 +149,11 @@ def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): api = BaseTask() if metric_name == 'unknown': - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError): api.plot_perf_over_time(metric_name) - assert excinfo._excinfo[0] == ValueError else: - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError): api.plot_perf_over_time(metric_name) - assert excinfo._excinfo[0] == RuntimeError @pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy')) @@ -175,7 +203,7 @@ def test_raise_error_get_perf_and_time(params): results = np.linspace(-1, 1, 10) cum_times = np.linspace(0, 1, 10) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError): _get_perf_and_time( cum_results=results, cum_times=cum_times, @@ -183,8 +211,6 @@ def test_raise_error_get_perf_and_time(params): worst_val=np.inf ) - assert excinfo._excinfo[0] == ValueError - @pytest.mark.parametrize('params', ( PlotSettingParams(n_points=20, xscale='linear', yscale='linear'),