From b39a86490665efa7a307c21841edd6611f4bdfa8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 13 Jun 2022 21:10:19 +0000 Subject: [PATCH 01/70] Use new Train API for examples --- doc/source/train/examples.rst | 2 +- .../examples/tune_linear_dataset_example.rst | 6 -- .../tune_torch_linear_dataset_example.rst | 6 ++ python/ray/air/result.py | 6 +- python/ray/train/BUILD | 10 --- .../train/examples/horovod/horovod_example.py | 17 +++-- .../examples/mlflow_fashion_mnist_example.py | 21 +++--- .../tensorflow_linear_dataset_example.py | 18 +++-- .../examples/tensorflow_mnist_example.py | 12 ++-- .../train/examples/tensorflow_quick_start.py | 17 +++-- ...peline_for_host_to_device_data_transfer.py | 13 ++-- .../ray/train/examples/torch_quick_start.py | 19 +++--- .../examples/train_fashion_mnist_example.py | 20 +++--- .../examples/train_linear_dataset_example.py | 15 ++-- .../train/examples/train_linear_example.py | 14 ++-- .../transformers/transformers_example.py | 17 +++-- .../tune_cifar_pytorch_pbt_example.py | 56 ++++++++------- .../examples/tune_linear_dataset_example.py | 68 ------------------- .../ray/train/examples/tune_linear_example.py | 34 ++++++---- .../examples/tune_tensorflow_mnist_example.py | 36 +++++----- python/ray/tune/result_grid.py | 6 +- 21 files changed, 178 insertions(+), 235 deletions(-) delete mode 100644 doc/source/train/examples/tune_linear_dataset_example.rst create mode 100644 doc/source/train/examples/tune_torch_linear_dataset_example.rst delete mode 100644 python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 6affd7457a1c..e644f708b639 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/train/examples/tune_linear_dataset_example`: +* :doc:`/air/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. diff --git a/doc/source/train/examples/tune_linear_dataset_example.rst b/doc/source/train/examples/tune_linear_dataset_example.rst deleted file mode 100644 index d25af796465c..000000000000 --- a/doc/source/train/examples/tune_linear_dataset_example.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -tune_linear_dataset_example -=========================== - -.. literalinclude:: /../../python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst new file mode 100644 index 000000000000..22ad2e562660 --- /dev/null +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -0,0 +1,6 @@ +:orphan: + +tune_torch_linear_dataset_example +================================= + +.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..97472c64d395 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,5 +1,5 @@ -from typing import Any, Dict, Optional from dataclasses import dataclass +from typing import Any, Dict, Optional from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI @@ -13,7 +13,7 @@ class Result: This is the class produced by Trainer.fit(). It contains a checkpoint, which can be used for resuming training and for creating a Predictor object. 
It also contains a metrics object describing - training metrics. `error` is included so that non successful runs + training metrics. ``error`` is included so that non successful runs and trials can be represented as well. The constructor is a private API. @@ -22,11 +22,13 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. + log_dir: Directory where the trial logs are saved. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] + log_dir: Optional[str] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index dc2d6357b6cd..d33bad4d9ac2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -82,16 +82,6 @@ py_test( args = ["--smoke-test"] ) -py_test( - name = "tune_linear_dataset_example", - size = "medium", - main = "examples/tune_linear_dataset_example.py", - srcs = ["examples/tune_linear_dataset_example.py"], - tags = ["team:ml", "exclusive", "gpu_only", "tune"], - deps = [":train_lib"], - args = ["--smoke-test", "--use-gpu"] -) - py_test( name = "tune_linear_example", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index cb578b1fb18f..c3202307755f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -2,15 +2,16 @@ import os import horovod.torch as hvd -import ray import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed from filelock import FileLock -from ray.train import Trainer from torchvision import datasets, transforms +import ray +from ray.train.horovod import HorovodTrainer + def metric_average(val, name): tensor = torch.tensor(val) @@ -152,11 +153,13 @@ def train_func(config): def main(num_workers, use_gpu, kwargs): - trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers) - trainer.start() - loss_per_epoch = trainer.run(train_func, config=kwargs) - trainer.shutdown() - print(loss_per_epoch) + trainer = HorovodTrainer( + train_func, + train_loop_config=kwargs, + scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, + ) + results = trainer.fit() + print(results) # Horovod Class API. 
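# A minimal sketch of consuming the Result that the new trainers return from
# trainer.fit(), assuming only the fields shown in the python/ray/air/result.py
# diff above. inspect_result is a hypothetical helper, not something defined in Ray.
from ray.train.horovod import HorovodTrainer


def inspect_result(trainer: HorovodTrainer) -> None:
    result = trainer.fit()
    print(result.metrics)     # final metrics reported by the training function
    print(result.checkpoint)  # last checkpoint, usable for resuming or for a Predictor
    print(result.log_dir)     # directory where the trial logs are saved (newly added field)
    if result.error is not None:
        raise result.error    # set when the trial finished in error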
diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 05f915523543..7cd54b821859 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,20 +1,23 @@ import argparse -from ray.train import Trainer +from ray.air import RunConfig from ray.train.examples.train_fashion_mnist_example import train_func -from ray.train.callbacks.logging import MLflowLoggerCallback +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback def main(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - final_results = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, + run_config=RunConfig( + callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")] + ), ) + final_results = trainer.fit() - print("Full losses for rank 0 worker: ", final_results) + print("Full results for rank 0 worker: ", final_results) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index c1360195b36c..9271c5125da4 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -7,8 +7,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.tensorflow import prepare_dataset_shard +from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard class TrainReportCallback(Callback): @@ -55,7 +54,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. 
multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard() + dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() results = [] @@ -78,14 +77,13 @@ def train_func(config): def train_tensorflow_linear(num_workers=2, use_gpu=False): dataset_pipeline = get_dataset_pipeline() - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, - dataset=dataset_pipeline, - config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + datasets={"train": dataset_pipeline}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 3e89969cc58e..980e58652d95 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -10,7 +10,7 @@ from tensorflow.keras.callbacks import Callback import ray.train as train -from ray.train import Trainer +from ray.train.tensorflow import TensorflowTrainer class TrainReportCallback(Callback): @@ -81,12 +81,12 @@ def train_func(config): def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0907853135b9..0ac3666672e2 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -3,6 +3,9 @@ # __tf_setup_begin__ +import json +import os + import numpy as np import tensorflow as tf @@ -47,8 +50,6 @@ def train_func(): # __tf_distributed_begin__ -import json -import os def train_func_distributed(): per_worker_batch_size = 64 @@ -78,15 +79,13 @@ def train_func_distributed(): # __tf_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="tensorflow", num_workers=4) + from ray.train.tensorflow import TensorflowTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="tensorflow", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TensorflowTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + trainer.fit() # __tf_trainer_end__ diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index c8cc25b044a0..03e69ca67f96 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -5,8 +5,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer +from ray.train.torch import TorchTrainer class Net(nn.Module): @@ -94,7 +95,6 @@ def train_func(config): def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=True) config = { "lr": 1e-2, "hidden_size": num_hidden_layers, @@ -102,9 +102,12 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo "epochs": epochs, "use_auto_transfer": use_auto_transfer, } - trainer.start() - results = trainer.run(train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"use_gpu": True, "num_workers": num_workers}, + ) + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index e152c8604610..eaf07a95a5d1 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -4,6 +4,10 @@ # __torch_setup_begin__ import torch import torch.nn as nn +import torch.optim as optim + +import ray.train.torch +from ray import train num_samples = 20 input_size = 10 @@ -28,7 +32,6 @@ def forward(self, input): # __torch_single_begin__ -import torch.optim as optim def train_func(): num_epochs = 3 @@ -48,8 +51,6 @@ def train_func(): # __torch_distributed_begin__ -from ray import train -import ray.train.torch def train_func_distributed(): num_epochs = 3 @@ -78,15 +79,13 @@ def train_func_distributed(): # __torch_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="torch", num_workers=4) + from ray.train.torch import TorchTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="torch", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TorchTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + results = trainer.fit() # __torch_trainer_end__ diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/train_fashion_mnist_example.py index 5c172dc5a949..6e8db3220db4 100644 --- a/python/ray/train/examples/train_fashion_mnist_example.py +++ b/python/ray/train/examples/train_fashion_mnist_example.py @@ -2,14 +2,14 @@ from typing import Dict import torch -import ray.train as train -from ray.train.trainer import Trainer -from ray.train.callbacks import JsonLoggerCallback from torch import nn from torch.utils.data import DataLoader from torchvision import datasets from torchvision.transforms import ToTensor +import ray.train as train +from ray.train.torch import TorchTrainer + # Download training data from open datasets. training_data = datasets.FashionMNIST( root="~/data", @@ -118,15 +118,13 @@ def train_func(config: Dict): def train_fashion_mnist(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - result = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[JsonLoggerCallback()], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() - print(f"Loss results: {result}") + result = trainer.fit() + print(f"Results: {result}") if __name__ == "__main__": diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 2ce30c9b81b8..1cfbff434c9f 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -8,8 +8,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: @@ -120,16 +119,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): datasets = get_datasets() - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} - trainer.start() - results = trainer.run( + trainer = TorchTrainer( train_func, - config, - dataset=datasets, - callbacks=[JsonLoggerCallback(), TBXLoggerCallback()], + train_loop_config=config, + datasets=datasets, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 8a784190d3cc..40d850754401 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -3,9 +3,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer class 
LinearDataset(torch.utils.data.Dataset): @@ -86,13 +86,13 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run( - train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()] + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index ce269733d4de..b6cb461c4b73 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -20,14 +20,12 @@ import math import os import random -from typing import Dict, Any +from typing import Any, Dict import datasets -import ray import transformers from accelerate import Accelerator from datasets import load_dataset, load_metric -from ray.train import Trainer from torch.utils.data.dataloader import DataLoader from tqdm.auto import tqdm from transformers import ( @@ -44,6 +42,9 @@ ) from transformers.utils.versions import require_version +import ray +from ray.train.torch import TorchTrainer + logger = logging.getLogger(__name__) require_version( @@ -612,9 +613,13 @@ def main(): else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - trainer.start() - trainer.run(train_func, config) + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) + results = trainer.fit() + print(results) else: # Run training locally. 
train_func(config) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index c600684e2479..5e4711adae84 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -11,9 +11,11 @@ import ray import ray.train as train from ray import tune -from ray.train import Trainer -from ray.tune import CLIReporter +from ray.air.config import FailureConfig, RunConfig +from ray.train.torch import TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.util.ml_utils.resnet import ResNet18 @@ -149,8 +151,10 @@ def train_func(config): else: ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - Trainable = trainer.to_tune_trainable(train_func) + trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="loss", @@ -158,32 +162,32 @@ def train_func(config): perturbation_interval=1, hyperparam_mutations={ # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), + "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config/momentum": [0.8, 0.9, 0.99], }, ) - reporter = CLIReporter() - reporter.add_metric_column("loss", "loss") - - analysis = tune.run( - Trainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "batch_size": 128 * args.num_workers, - "epochs": args.num_epochs, - "test_mode": args.smoke_test, # whether to to subset the data + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "batch_size": 128 * args.num_workers, + "epochs": args.num_epochs, + "test_mode": args.smoke_test, # whether to to subset the data + } }, - stop={"training_iteration": 2 if args.smoke_test else 100}, - max_failures=3, # used for fault tolerance - checkpoint_freq=3, # used for fault tolerance - keep_checkpoints_num=1, # used for fault tolerance - verbose=2, - progress_reporter=reporter, - scheduler=pbt_scheduler, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 2 if args.smoke_test else 100}, + failure=FailureConfig(max_failures=3), # used for fault tolerance + ), ) - print(analysis.get_best_config(metric="loss", mode="min")) + results = tuner.fit() + + print(results.get_best_result(metric="loss", mode="min")) diff --git a/python/ray/train/examples/tune_linear_dataset_example.py b/python/ray/train/examples/tune_linear_dataset_example.py deleted file mode 100644 index adc04f9ba3e3..000000000000 --- a/python/ray/train/examples/tune_linear_dataset_example.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse - -import ray -from ray import tune -from ray.train import Trainer - -from train_linear_dataset_example import train_func, get_datasets - - -def tune_linear(num_workers, num_samples, use_gpu): - datasets = get_datasets() - - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - Trainable = trainer.to_tune_trainable(train_func, dataset=datasets) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, - }, - ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for testing.", - ) - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--num-samples", - type=int, - default=2, - help="Sets number of samples for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Use GPU for training." - ) - - args = parser.parse_args() - - if args.smoke_test: - # 1 for driver, 1 for datasets - num_cpus = args.num_workers + 2 - num_gpus = args.num_workers if args.use_gpu else 0 - ray.init(num_cpus=args.num_workers + 2, num_gpus=num_gpus) - else: - ray.init(address=args.address) - tune_linear( - num_workers=args.num_workers, use_gpu=args.use_gpu, num_samples=args.num_samples - ) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py index a0641906c202..5d4a8edc911b 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,27 +1,31 @@ import argparse +from train_linear_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from train_linear_example import train_func +from ray.train.torch import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_linear(num_workers, num_samples): - trainer = Trainer("torch", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, + trainer = TorchTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([4, 16, 32]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results + analysis = tuner.fit() + result = analysis.get_best_result(metric="loss", mode="min") + print(result) + return result if __name__ == "__main__": diff --git a/python/ray/train/examples/tune_tensorflow_mnist_example.py b/python/ray/train/examples/tune_tensorflow_mnist_example.py index 8ab6776c3b64..4fc408b2d6eb 100644 --- a/python/ray/train/examples/tune_tensorflow_mnist_example.py +++ b/python/ray/train/examples/tune_tensorflow_mnist_example.py @@ -1,28 +1,32 @@ import argparse +from tensorflow_mnist_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from tensorflow_mnist_example import train_func +from ray.train.tensorflow import TensorflowTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_tensorflow_mnist(num_workers, num_samples): - trainer = Trainer(backend="tensorflow", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": 3, + trainer = TensorflowTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - best_loss = analysis.get_best_config(metric="loss", mode="min") - best_accuracy = analysis.get_best_config(metric="accuracy", mode="max") - print(f"Best loss config: {best_loss}") - print(f"Best accuracy config: {best_accuracy}") + analysis = tuner.fit() + best_loss = analysis.get_best_result(metric="loss", mode="min") + best_accuracy = analysis.get_best_result(metric="accuracy", mode="max") + print(f"Best loss result: {best_loss}") + print(f"Best accuracy result: {best_accuracy}") return analysis diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 66440074a62f..f770e3f05f35 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -2,10 +2,11 @@ from typing import Optional, Union import pandas as pd -from ray.cloudpickle import cloudpickle -from ray.exceptions import RayTaskError + from ray.air.checkpoint import Checkpoint from ray.air.result import Result +from ray.cloudpickle import cloudpickle +from ray.exceptions import RayTaskError from ray.tune import ExperimentAnalysis from ray.tune.error import TuneError from ray.tune.trial import Trial @@ -177,5 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + log_dir=trial.logdir, ) return result From b31399ef71afd24f0ac84df9bcbb8a761be893b6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:15 +0000 Subject: [PATCH 02/70] Fix FailureConfig not being a dataclass --- python/ray/air/config.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 40b63603f51c..a8f5c2b85c66 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -1,14 +1,5 @@ from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional, Union from ray.air.constants import WILDCARD_KEY from ray.tune.syncer import SyncConfig @@ -267,6 +258,7 @@ def _merge(self, other: "DatasetConfig") -> "DatasetConfig": return new_config +@dataclass @PublicAPI(stability="alpha") class FailureConfig: """Configuration related to failure handling of each run/trial. 
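With FailureConfig now a proper dataclass, it can be constructed with keyword
arguments and handed to RunConfig, as the PBT example in the previous patch does.
A minimal sketch for reference; the ``failure`` keyword and ``max_failures`` field
are taken from the diffs above, and the value is illustrative only:

from ray.air.config import FailureConfig, RunConfig

# Retry a failing trial up to three times before marking it failed (illustrative value).
run_config = RunConfig(failure=FailureConfig(max_failures=3))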
From 5cc9229716f8526f10632fddf0ef282308a47da4 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:22 +0000 Subject: [PATCH 03/70] Fix errors --- .../examples/mlflow_fashion_mnist_example.py | 2 +- .../train/examples/mlflow_simple_example.py | 36 ++++++++++--------- .../examples/tensorflow_mnist_example.py | 2 +- .../train/examples/train_linear_example.py | 2 +- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 7cd54b821859..1cda7fc3e1ac 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -47,7 +47,7 @@ def main(num_workers=2, use_gpu=False): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) args.num_workers = 2 args.use_gpu = False else: diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index 548b44d96f3c..d61a435ce3e3 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -1,6 +1,8 @@ from ray import train -from ray.train import Trainer -from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback +from ray.air import RunConfig +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback +from ray.tune.logger import TBXLoggerCallback def train_func(): @@ -8,29 +10,31 @@ def train_func(): train.report(epoch=i) -trainer = Trainer(backend="torch", num_workers=2) -trainer.start() +trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": 2}, + run_config=RunConfig( + callbacks=[ + MLflowLoggerCallback(experiment_name="train_experiment"), + TBXLoggerCallback(), + ], + ), +) # Run the training function, logging all the intermediate results # to MLflow and Tensorboard. -result = trainer.run( - train_func, - callbacks=[ - MLflowLoggerCallback(experiment_name="train_experiment"), - TBXLoggerCallback(), - ], -) +result = trainer.fit() # Print the latest run directory and keep note of it. -# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001 -print("Run directory:", trainer.latest_run_dir) - -trainer.shutdown() +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 +print("Run directory:", result.logdir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ray_results/train_2021-09-01_12-00-00/run_001` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` # $ cd # # # View the MLflow UI. 
diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 980e58652d95..a0ef319f8756 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -120,7 +120,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_tensorflow_mnist() else: ray.init(address=args.address) diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 40d850754401..069c6dd13db1 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_linear() else: ray.init(address=args.address) From 523021843ab6886fbdce1f40381be0ed733ced98 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:01:17 +0000 Subject: [PATCH 04/70] Fix --- doc/source/train/examples/tune_torch_linear_dataset_example.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst index 22ad2e562660..df74e93ebdf2 100644 --- a/doc/source/train/examples/tune_torch_linear_dataset_example.rst +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -3,4 +3,4 @@ tune_torch_linear_dataset_example ================================= -.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py +.. literalinclude:: /../../python/ray/air/examples/pytorch/tune_torch_linear_dataset_example.py From ef4a3fcda417ca38dae7fc8caf37b289481dd064 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:46:39 +0000 Subject: [PATCH 05/70] Fix link --- doc/source/train/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index e644f708b639..2a4e0b75bbd1 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/air/examples/tune_torch_linear_dataset_example`: +* :doc:`/train/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. From f5cfe6262dfeb173663ab7693ff1dfd60b385208 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 19:47:22 +0000 Subject: [PATCH 06/70] Fix simple example --- .../train/examples/mlflow_simple_example.py | 25 +++++++++++++------ python/ray/tune/result_grid.py | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d61a435ce3e3..d64d0525ae58 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -25,20 +25,29 @@ def train_func(): # to MLflow and Tensorboard. result = trainer.fit() +# For MLFLow logs: + +# MLFlow logs will by default be saved in an `mlflow` directory +# in the current working directory. + +# $ cd mlflow +# # View the MLflow UI. 
+# $ mlflow ui + +# You can change the directory by setting the `tracking_uri` argument +# in `MLflowLoggerCallback`. + +# For TensorBoard logs: + # Print the latest run directory and keep note of it. -# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 -print("Run directory:", result.logdir) +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 +print("Run directory:", result.log_dir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06` # $ cd # -# # View the MLflow UI. -# $ mlflow ui -# # # View the tensorboard UI. # $ tensorboard --logdir . diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index f770e3f05f35..38ebf357e5e4 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -178,6 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.logdir, + log_dir=trial.local_dir, ) return result From 468f7e80f48049f041a8c5fb038cc2d49a280b14 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:09:43 +0000 Subject: [PATCH 07/70] train loop utils --- python/ray/train/train_loop_utils.py | 80 ++++++++++++++++------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index b4dde5f4ca7b..774fda94a324 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -1,11 +1,8 @@ -from typing import TYPE_CHECKING -from typing import Optional, Dict, Union import warnings +from typing import TYPE_CHECKING, Dict, Optional, Union +from ray.train._internal.session import get_session from ray.train.constants import SESSION_MISUSE_LOG_ONCE_KEY -from ray.train._internal.session import ( - get_session, -) from ray.util import PublicAPI, log_once if TYPE_CHECKING: @@ -41,23 +38,25 @@ def get_dataset_shard( import ray from ray import train + from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard().to_torch() + data_shard = train.get_dataset_shard("train").to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() - trainer = Trainer(backend="torch") - trainer.start() - # Trainer will automatically handle sharding. - train_model = trainer.run(train_func, dataset=dataset) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + datasets={"train": dataset}, + scaling_config={"num_workers": 2}, + ) + trainer.fit() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -98,16 +97,15 @@ def report(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() Args: **kwargs: Any key value pair to be reported by Train. 
@@ -129,6 +127,7 @@ def world_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -136,10 +135,8 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() """ session = get_session() @@ -156,16 +153,18 @@ def local_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = Trainer(backend="torch", use_gpu=True) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + scaling_config={"use_gpu": True, "num_workers": 2}, + ) + trainer.fit() """ session = get_session() @@ -181,18 +180,29 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train + from ray.air import Checkpoint + from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func, checkpoint={"epoch": 3}) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + "epoch": 3, + } + ) + trainer = TorchTrainer( + train_func, + resume_from_checkpoint=checkpoint, + scaling_config={"num_workers": 2}, + ) + trainer.fit() # 3 # 4 - trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -216,16 +226,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + result = trainer.fit() + assert result.checkpoint Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -245,14 +255,14 @@ def world_size() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = Trainer(backend="torch", num_workers=4) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) + result = trainer.fit() + """ session = get_session() if session is None: From 4ef6302cc5d4bfd3c00caaeb2e9ad6c678946858 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:29:42 +0000 Subject: [PATCH 08/70] Remove tensorboard example --- python/ray/train/BUILD | 9 -- .../torch_tensorboard_profiler_example.py | 84 ------------------- 2 files changed, 93 deletions(-) delete mode 100644 python/ray/train/examples/torch_tensorboard_profiler_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index d33bad4d9ac2..bf73935be49c 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -39,15 +39,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "torch_tensorboard_profiler_example", - size = "small", - main = "examples/torch_tensorboard_profiler_example.py", - srcs = ["examples/torch_tensorboard_profiler_example.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "transformers_example_gpu", size = "large", diff --git a/python/ray/train/examples/torch_tensorboard_profiler_example.py b/python/ray/train/examples/torch_tensorboard_profiler_example.py deleted file mode 100644 index 5f3641c31c8d..000000000000 --- a/python/ray/train/examples/torch_tensorboard_profiler_example.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -import torch -from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present -from torch.profiler import profile, record_function, schedule - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import TBXLoggerCallback -from ray.train.callbacks.profile import TorchTensorboardProfilerCallback -from ray.train.torch import TorchWorkerProfiler - - -def train_func(): - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - # Setup model. - model = torch.nn.Linear(1, 1) - model = train.torch.prepare_model(model) - loss_fn = torch.nn.MSELoss() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-2) - - # Setup data. - input = torch.randn(1000, 1) - labels = input * 2 - dataset = torch.utils.data.TensorDataset(input, labels) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=32) - dataloader = train.torch.prepare_data_loader(dataloader) - - # Train. 
- for epoch in range(5): - with record_function("train_epoch"): - for X, y in dataloader: - pred = model(X) - loss = loss_fn(pred, y) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - with record_function("train_checkpoint"): - state_dict = model.state_dict() - consume_prefix_in_state_dict_if_present(state_dict, "module.") - train.save_checkpoint(epoch=epoch, model_weights=state_dict) - - p.step() - - with record_function("train_report"): - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Enables GPU training" - ) - - args = parser.parse_args() - - ray.init(address=args.address) - - callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()] - trainer = Trainer( - backend="torch", num_workers=args.num_workers, use_gpu=args.use_gpu - ) - trainer.start() - trainer.run(train_func, callbacks=callbacks) - trainer.shutdown() From 5db3c14e400cc94da0448dd26b3cf5328b82ea4d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:30:09 +0000 Subject: [PATCH 09/70] PBT test update --- .../tune_cifar_pytorch_pbt_example.py | 13 ++-- .../workloads/pytorch_pbt_failure.py | 77 ++++++++++--------- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index 5e4711adae84..a7031b3116a1 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -58,6 +58,7 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): + # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) @@ -157,14 +158,14 @@ def train_func(config): ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "train_loop_config/momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index 903e2a1cc553..d354b2834ac6 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -4,16 +4,16 @@ import numpy as np import ray - from ray import tune +from ray.air.config import RunConfig +from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.tune.utils.mock import FailureInjectorCallback from 
ray.tune.utils.release_test_util import ProgressCallback -from ray.train import Trainer -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func -from ray.train.torch import TorchConfig - parser = argparse.ArgumentParser() parser.add_argument( "--smoke-test", @@ -26,46 +26,53 @@ ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) num_training_workers = 1 if args.smoke_test else 3 -trainer = Trainer( - num_workers=num_training_workers, - use_gpu=not args.smoke_test, - backend=TorchConfig(backend="gloo"), +trainer = TorchTrainer( + train_func, + scaling_config=dict( + num_workers=num_training_workers, + use_gpu=not args.smoke_test, + ), + torch_config=TorchConfig(backend="gloo"), ) -TorchTrainable = trainer.to_tune_trainable(train_func=train_func) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) -analysis = tune.run( - TorchTrainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "head_location": None, - "worker_locations": None, - "test_mode": args.smoke_test, - "batch_size": 128 * num_training_workers, - # For the long running test, we want the training to run forever, and it will - # be terminated by the release test infra. - "epochs": 1 if args.smoke_test else sys.maxsize, +tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "head_location": None, + "worker_locations": None, + "test_mode": args.smoke_test, + "batch_size": 128 * num_training_workers, + # For the long running test, we want the training to run forever, + # and it will be terminated by the release test infra. 
+ "epochs": 1 if args.smoke_test else sys.maxsize, + } }, - max_failures=-1, # used for fault tolerance - checkpoint_freq=2, # used for fault tolerance - scheduler=pbt_scheduler, - callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], - stop={"training_iteration": 1} if args.smoke_test else None, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 1} if args.smoke_test else None, + callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], + ), ) -print(analysis.get_best_config(metric="loss", mode="min")) +results = tuner.fit() + +print(results.get_best_result(metric="loss", mode="min")) From cb805f297c6f14e0878cdca9fbc3774d4070191e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:48:31 +0000 Subject: [PATCH 10/70] WIP --- .../transformers/transformers_example.py | 2 +- python/ray/train/tests/test_examples.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index b6cb461c4b73..1b47e5ce2e31 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -609,7 +609,7 @@ def main(): if args.start_local or args.address or args.num_workers > 1 or args.use_gpu: if args.start_local: # Start a local Ray runtime. - ray.init(num_cpus=args.num_workers) + ray.init(num_cpus=args.num_workers + 2) else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index c72249b95ad5..10ebefa03588 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -19,7 +19,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -35,14 +37,11 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers) + trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() + results = trainer.fit() - assert len(results) == num_workers - result = results[0] + result = results.metrics loss = result["loss"] assert len(loss) == epochs @@ -56,17 +55,15 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): def test_tf_non_distributed(ray_start_2_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(tf_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer.fit() def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = Trainer("tensorflow", num_workers=2) + trainer = 
TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) From 2f69e37e50b57e282c406f198096848a6e03c5d1 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:08:55 +0000 Subject: [PATCH 11/70] Do not use pipeline --- .../ray/train/examples/train_linear_dataset_example.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 1cfbff434c9f..f84faa5f7a11 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -7,11 +7,10 @@ import ray import ray.train as train from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: +def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -23,12 +22,9 @@ def get_dataset(a, b, size) -> Dataset: [split] ) - train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() - validation_dataset_pipeline = validation_dataset.repeat() - datasets = { - "train": train_dataset_pipeline, - "validation": validation_dataset_pipeline, + "train": train_dataset, + "validation": validation_dataset, } return datasets From 0d8eeb4a879f161e7215f117c75ef4ded4358099 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:14:21 +0000 Subject: [PATCH 12/70] Remove callback test --- python/ray/train/BUILD | 8 - python/ray/train/tests/test_callbacks.py | 357 ----------------------- 2 files changed, 365 deletions(-) delete mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index bf73935be49c..1db84dd79d37 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -113,14 +113,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "test_callbacks", - size = "medium", - srcs = ["tests/test_callbacks.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py deleted file mode 100644 index 9aeb54088801..000000000000 --- a/python/ray/train/tests/test_callbacks.py +++ /dev/null @@ -1,357 +0,0 @@ -from typing import Dict, List -import glob -import io -import json -from collections import defaultdict -from contextlib import redirect_stdout -from pathlib import Path - -import pytest - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import ( - TrainingCallback, - JsonLoggerCallback, - PrintCallback, - TBXLoggerCallback, - TorchTensorboardProfilerCallback, -) -from ray.train.callbacks.logging import ( - MLflowLoggerCallback, - _TrainCallbackLogdirManager, -) -from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, - BASIC_AUTOFILLED_KEYS, - ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from 
ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, -) - -try: - from tensorflow.python.summary.summary_iterator import summary_iterator -except ImportError: - summary_iterator = None - - -@pytest.fixture -def ray_start_4_cpus(): - address_info = ray.init(num_cpus=4) - yield address_info - # The code after the yield will run as teardown code. - ray.shutdown() - - -class TestConfig(BackendConfig): - @property - def backend_cls(self): - return TestBackend - - -class TestBackend(Backend): - def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - -def test_print(ray_start_4_cpus): - num_workers = 4 - - def train_func(): - train.report(rank=train.world_rank()) - - stream = io.StringIO() - with redirect_stdout(stream): - trainer = Trainer(TestConfig(), num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[PrintCallback()]) - trainer.shutdown() - - output = stream.getvalue() - results = json.loads(output) - - assert len(results) == num_workers - for i, result in enumerate(results): - assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) - assert result["rank"] == i - - -@pytest.mark.parametrize("input", [None, "dir", "file"]) -def test_train_callback_logdir_manager(tmp_path, input): - default_dir = tmp_path / "default_dir" - - if input == "dir": - input_logdir = tmp_path / "dir" - input_logdir.mkdir(parents=True) - elif input == "file": - input_logdir = tmp_path / "file" - input_logdir.touch() - else: - input_logdir = None - - logdir_manager = _TrainCallbackLogdirManager(input_logdir) - - if input_logdir: - path = logdir_manager.logdir_path - assert path == logdir_manager.logdir_path - else: - with pytest.raises(RuntimeError): - path = logdir_manager.logdir_path - - if input_logdir and not Path(input_logdir).is_dir(): - with pytest.raises(FileExistsError): - logdir_manager.setup_logdir(str(default_dir)) - else: - path = logdir_manager.setup_logdir(str(default_dir)) - assert path == logdir_manager.logdir_path - - -@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) -@pytest.mark.parametrize("detailed", [False, True]) -@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) -def test_json( - monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename -): - if detailed: - monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") - - config = TestConfig() - - num_iters = 5 - num_workers = 4 - - if workers_to_log is None: - num_workers_to_log = num_workers - elif isinstance(workers_to_log, int): - num_workers_to_log = 1 - else: - num_workers_to_log = len(workers_to_log) - - def train_func(): - for i in range(num_iters): - train.report(index=i) - return 1 - - if filename is None: - # if None, use default value - callback = JsonLoggerCallback(workers_to_log=workers_to_log) - else: - callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) - trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - if filename is None: - assert str(callback.log_path.name) == JsonLoggerCallback._default_filename - else: - assert str(callback.log_path.name) == filename - - with open(callback.log_path, "r") as f: - log = json.load(f) - print(log) - assert len(log) == num_iters - assert len(log[0]) == num_workers_to_log - assert all(len(element) == len(log[0]) for element in log) 
- assert all( - all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) - for element in log - ) - assert all( - all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) - for element in log - ) - if detailed: - assert all( - all( - all(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - else: - assert all( - all( - not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - - -def _validate_tbx_result(events_dir): - events_file = list(glob.glob(f"{events_dir}/events*"))[0] - results = defaultdict(list) - for event in summary_iterator(events_file): - for v in event.summary.value: - assert v.tag.startswith("ray/train") - results[v.tag[10:]].append(v.simple_value) - - assert len(results["episode_reward_mean"]) == 3 - assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] - assert len(results["score"]) == 1 - assert len(results["hello/world"]) == 1 - - -def test_TBX(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - - def train_func(): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) - return 1 - - callback = TBXLoggerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - _validate_tbx_result(temp_dir) - - -def test_mlflow(ray_start_4_cpus, tmp_path): - config = TestConfig() - - params = {"p1": "p1"} - - temp_dir = tmp_path - num_workers = 4 - - def train_func(config): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6) - return 1 - - callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, config=params, callbacks=[callback]) - - from mlflow.tracking import MlflowClient - - client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) - - experiment_id = client.get_experiment_by_name("test_exp").experiment_id - all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) - assert len(all_runs) == 1 - # all_runs is a pandas dataframe. 
- all_runs = all_runs.to_dict(orient="records") - run_id = all_runs[0]["run_id"] - run = client.get_run(run_id) - - assert run.data.params == params - assert ( - "episode_reward_mean" in run.data.metrics - and run.data.metrics["episode_reward_mean"] == 6.0 - ) - assert ( - TRAINING_ITERATION in run.data.metrics - and run.data.metrics[TRAINING_ITERATION] == 3.0 - ) - - metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") - - assert len(metric_history) == 3 - iterations = [metric.step for metric in metric_history] - assert iterations == [1, 2, 3] - rewards = [metric.value for metric in metric_history] - assert rewards == [4, 5, 6] - - -def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - num_epochs = 2 - - def train_func(): - from ray.train.torch import TorchWorkerProfiler - from torch.profiler import profile, record_function, schedule - - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - for epoch in range(num_epochs): - with record_function("test_function"): - pass - - p.step() - - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - callback = TorchTensorboardProfilerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - assert temp_dir.exists() - - count = 0 - for path in temp_dir.iterdir(): - assert path.is_file() - count += 1 - assert count == num_workers * num_epochs - - -# fix issue: repeat assignments for preprocessor results nested recursive calling -# see https://github.com/ray-project/ray/issues/25005 -def test_hotfix_callback_nested_recusive_calling(): - # test callback used to simulate the nested recursive calling for preprocess() - class TestCallback(TrainingCallback): - def __init__(self): - self.max_process_time = 0 - - def count_process_times(self, processor): - count = 0 - if processor: - if isinstance(processor, SequentialResultsPreprocessor): - for preprocessor in processor.preprocessors: - # recursive calling preprocessors in list - count += self.count_process_times(preprocessor) - else: - count = 1 - return count - - def handle_result(self, results: List[Dict], **info): - process_times = self.count_process_times(self.results_preprocessor) - if process_times > self.max_process_time: - self.max_process_time = process_times - print(f"process times: {process_times}") - - def train_func(): - for idx in range(num_iterates): - train.report(iterate=idx + 1) - - # python default limitation for iterate depth - num_iterates = 1000 - trainer = Trainer(TestConfig(), num_workers=1) - trainer.start() - test_callback = TestCallback() - trainer.run(train_func, callbacks=[test_callback]) - assert test_callback.max_process_time == 1 - print(f"callback max process time: {test_callback.max_process_time}") - trainer.shutdown() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", "-x", __file__])) From 4a3103ec4a3fcd06761a9ddf51895407bbef3c87 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 19:10:27 +0000 Subject: [PATCH 13/70] Examples tests --- .../train/examples/horovod/horovod_example.py | 3 +- python/ray/train/tests/test_examples.py | 119 +++++++++--------- 2 files changed, 62 insertions(+), 60 deletions(-) diff --git 
a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c3202307755f..1e163da70052 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -10,6 +10,7 @@ from torchvision import datasets, transforms import ray +from ray import train from ray.train.horovod import HorovodTrainer @@ -148,7 +149,7 @@ def train_func(config): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - results.append(loss) + train.report(loss=loss) return results diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 10ebefa03588..e37cf43a4687 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -2,9 +2,10 @@ import ray from ray.train import Trainer +from ray.train.constants import TRAINING_ITERATION +from ray.train.examples.horovod.horovod_example import HorovodTrainClass from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, - HorovodTrainClass, ) from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, @@ -19,52 +20,56 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. 
ray.shutdown() @pytest.mark.parametrize("num_workers", [1, 2]) -def test_tensorflow_mnist(ray_start_2_cpus, num_workers): +def test_tensorflow_mnist(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) results = trainer.fit() result = results.metrics - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] - - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs -def test_tf_non_distributed(ray_start_2_cpus): +def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer = TorchTrainer( + tf_quick_start_train_func, scaling_config=dict(num_workers=1) + ) trainer.fit() -def test_tensorflow_mnist_fail(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) + ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -85,24 +90,24 @@ def test_tensorflow_mnist_fail(ray_start_2_cpus): @pytest.mark.parametrize("num_workers", [1, 2]) -def test_torch_linear(ray_start_2_cpus, num_workers): +def test_torch_linear(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run(linear_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers + trainer = TorchTrainer( + linear_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_linear_failure(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -113,56 +118,51 @@ def test_torch_linear_failure(ray_start_2_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + assert result[TRAINING_ITERATION] == epochs -def test_torch_fashion_mnist(ray_start_2_cpus): +def test_torch_fashion_mnist(ray_start_4_cpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == 
num_workers + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_non_distributed(ray_start_2_cpus): +def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(torch_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer( + torch_quick_start_train_func, scaling_config=dict(num_workers=1) + ) + trainer.fit() -def test_horovod_torch_mnist(ray_start_2_cpus): +def test_horovod_torch_mnist(ray_start_4_cpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers -def test_horovod_torch_mnist_stateful(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 trainer = Trainer("horovod", num_workers) @@ -180,7 +180,8 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From f7f3ea8559f3b2237895108d03cdc985646f9c3f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 20:14:22 +0000 Subject: [PATCH 14/70] Move tests --- python/ray/train/tests/test_examples.py | 12 ++- python/ray/train/tests/test_gpu.py | 85 +++++++++--------- python/ray/train/tests/test_minimal.py | 59 +++++------- python/ray/train/tests/test_tune.py | 114 ++++++++++++++---------- 4 files changed, 138 insertions(+), 132 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index e37cf43a4687..06c88577205e 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -61,15 +61,13 @@ def test_tf_non_distributed(ray_start_4_cpus): trainer.fit() -@pytest.mark.skip("Refactor as a backend test.") -def test_tensorflow_mnist_fail(ray_start_4_cpus): +# TODO: Refactor as a backend test. +def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 + trainer = Trainer("tensorflow", num_workers=2) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer = TensorflowTrainer( - tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) - ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -106,7 +104,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -161,7 +159,7 @@ def test_horovod_torch_mnist(ray_start_4_cpus): assert result[TRAINING_ITERATION] == num_workers -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 875ad766ebda..16dac0c42fc7 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -1,15 +1,17 @@ import os -import pytest from timeit import default_timer as timer +import pytest import torch +import torchvision +from test_tune import torch_fashion_mnist, tune_tensorflow_mnist from torch.nn.parallel import DistributedDataParallel from torch.utils.data import DataLoader, DistributedSampler -import torchvision import ray import ray.train as train from ray.train import Trainer, TrainingCallback +from ray.train.constants import TRAINING_ITERATION from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, ) @@ -20,7 +22,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import LinearDataset -from test_tune import torch_fashion_mnist, tune_tensorflow_mnist +from ray.train.horovod.horovod_trainer import HorovodTrainer +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -38,6 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -64,6 +69,7 @@ def train_fn(): ) +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -85,6 +91,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -108,6 +115,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -154,6 +162,7 @@ def train_func(): assert result1 == result2 +@pytest.mark.skip("Refactor as a backend test.") def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -196,6 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) +@pytest.mark.skip("Refactor as a backend test.") def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -213,6 +223,7 @@ def train_func(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -287,55 +298,47 @@ def test_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = 
trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers - result = results[0] + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] + result = results.metrics - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs def test_torch_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + assert result[TRAINING_ITERATION] == epochs def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers, use_gpu=True) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers, use_gpu=True), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers def test_tune_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): @@ -349,9 +352,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): from ray.train.examples.train_linear_dataset_example import train_linear - results = train_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_linear(num_workers=2, use_gpu=True) def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): @@ -359,11 +360,10 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): train_tensorflow_linear, ) - results = train_tensorflow_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_tensorflow_linear(num_workers=2, use_gpu=True) +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ @@ -376,8 +376,8 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): def test_auto_transfer_data_from_host_to_device( ray_start_1_cpu_1_gpu, device_choice, auto_transfer ): - import torch import numpy as np + import torch def compute_average_runtime(func): device = torch.device(device_choice) @@ -417,7 +417,8 @@ def host_to_device_auto_pipeline(device): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", "-s", __file__])) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index c6c6d6bba7b3..5f3be1d4c3b3 
100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,18 +1,16 @@ -from typing import List, Dict - import pytest import ray import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import TrainingCallback +from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -32,15 +30,7 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass -class TestCallback(TrainingCallback): - def __init__(self): - self.result_list = [] - - def handle_result(self, results: List[Dict], **info): - self.result_list.append(results) - - -def test_run(ray_start_2_cpus): +def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 key = "value" @@ -53,27 +43,23 @@ def train_func(): train.save_checkpoint(**checkpoint) return checkpoint[key] - checkpoint = {key: value} - test_callback = TestCallback() - - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + key: value, + } + ) - # Test results. - assert len(results) == num_workers - assert all(result == 1 for result in results) + trainer = DataParallelTrainer( + train_func, + backend_config=config, + resume_from_checkpoint=checkpoint, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - # Test reporting and callbacks. - assert len(test_callback.result_list) == value - assert len(test_callback.result_list[0]) == num_workers - print(test_callback.result_list[0]) - assert all(result[key] == value for result in test_callback.result_list[0]) - - # Test checkpointing. 
- assert trainer.latest_checkpoint[key] == value - - trainer.shutdown() + assert results.checkpoint def test_failure(): @@ -89,7 +75,8 @@ def test_failure(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index f08b3da43dc6..2fed4e42fa43 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -1,25 +1,31 @@ import os import pytest + import ray import ray.train as train from ray import tune from ray.air import Checkpoint -from ray.tune import TuneError +from ray.air.config import FailureConfig, RunConfig from ray.train import Trainer +from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) from ray.train.examples.train_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train._internal.worker_group import WorkerGroup +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -50,18 +56,24 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): def torch_fashion_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(fashion_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + trainer = TorchTrainer( + fashion_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. for path, df in analysis.trial_dataframes.items(): @@ -74,18 +86,25 @@ def test_tune_torch_fashion_mnist(ray_start_8_cpus): def tune_tensorflow_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(tensorflow_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. 
for path, df in analysis.trial_dataframes.items(): @@ -96,18 +115,7 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) -def test_tune_error(ray_start_2_cpus): - def train_func(config): - raise RuntimeError("Error in training function!") - - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - with pytest.raises(TuneError): - tune.run(TestTrainable) - - -def test_tune_checkpoint(ray_start_2_cpus): +def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) @@ -123,7 +131,7 @@ def train_func(): assert checkpoint["hello"] == "world" -def test_reuse_checkpoint(ray_start_2_cpus): +def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 ckpt = train.load_checkpoint() @@ -134,19 +142,28 @@ def train_func(config): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 4 - analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 10}}, + ).restore(trial.local_dir) + analysis = tuner.fit()._experiment_analysis trial_dfs = list(analysis.trial_dataframes.values()) assert len(trial_dfs[0]["training_iteration"]) == 5 -def test_retry(ray_start_2_cpus): +def test_retry(ray_start_4_cpus): def train_func(): ckpt = train.load_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? @@ -160,10 +177,12 @@ def train_func(): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) - analysis = tune.run(TestTrainable, max_failures=3) + analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 3 @@ -173,7 +192,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From 50ca40b3a9b701d3a97a256295de9505f5d82605 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 21:20:04 +0000 Subject: [PATCH 15/70] Fixture fix --- python/ray/train/tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 06c88577205e..169c0a29e236 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -62,7 +62,7 @@ def test_tf_non_distributed(ray_start_4_cpus): # TODO: Refactor as a backend test. 
-def test_tensorflow_mnist_fail(ray_start_2_cpus): +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 From 20b707571febcff0ec8f2d9ba79e8005d56f85fd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 15:48:50 +0000 Subject: [PATCH 16/70] CI fixes --- .../examples/train_linear_dataset_example.py | 22 ++++++++++++++----- python/ray/train/tests/test_examples.py | 6 +++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index f84faa5f7a11..3038ac66aa9e 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -1,16 +1,19 @@ import argparse -from typing import Dict +from typing import Dict, Tuple import torch import torch.nn as nn import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: +def get_datasets_and_configs( + a=5, b=10, size=1000, split=0.8 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -27,7 +30,13 @@ def get_dataset(a, b, size) -> Dataset: "validation": validation_dataset, } - return datasets + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + "validation": DatasetConfig(use_stream_api=True), + } + + return datasets, dataset_configs def train_epoch(iterable_dataset, model, loss_fn, optimizer, device): @@ -113,13 +122,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): - datasets = get_datasets() + datasets, dataset_configs = get_datasets_and_configs() config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} trainer = TorchTrainer( train_func, train_loop_config=config, datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -152,8 +162,8 @@ def train_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 169c0a29e236..fd6a2fadbf91 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -116,9 +116,11 @@ def test_torch_linear_failure(ray_start_4_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - result = results.metrics + assert len(results) == num_workers - assert result[TRAINING_ITERATION] == epochs + for result in results: + assert len(result) == epochs + assert result[-1]["loss"] < result[0]["loss"] def test_torch_fashion_mnist(ray_start_4_cpus): From c3b7d42c5f15cf3d44fec370df0c5cff9443b96e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 16:51:32 +0000 Subject: [PATCH 17/70] Fix --- .../ray/train/examples/tensorflow_quick_start.py | 7 +++---- python/ray/train/examples/torch_quick_start.py | 7 +++---- 
python/ray/train/tests/test_gpu.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0ac3666672e2..f0c7f3d10f4e 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -1,15 +1,12 @@ # flake8: noqa # fmt: off +# isort: skip_file # __tf_setup_begin__ -import json -import os - import numpy as np import tensorflow as tf - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. @@ -50,6 +47,8 @@ def train_func(): # __tf_distributed_begin__ +import json +import os def train_func_distributed(): per_worker_batch_size = 64 diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index eaf07a95a5d1..2f0da37ddbc9 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -1,13 +1,10 @@ # flake8: noqa # fmt: off +# isort: skip_file # __torch_setup_begin__ import torch import torch.nn as nn -import torch.optim as optim - -import ray.train.torch -from ray import train num_samples = 20 input_size = 10 @@ -32,6 +29,7 @@ def forward(self, input): # __torch_single_begin__ +import torch.optim as optim def train_func(): num_epochs = 3 @@ -51,6 +49,7 @@ def train_func(): # __torch_distributed_begin__ +from ray import train def train_func_distributed(): num_epochs = 3 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 16dac0c42fc7..ac9a0afe7cfb 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -42,7 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -69,7 +69,7 @@ def train_fn(): ) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -91,7 +91,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -115,7 +115,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -162,7 +162,7 @@ def train_func(): assert result1 == result2 -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -205,7 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -223,7 +223,7 @@ def train_func(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -363,7 +363,7 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): assert train_tensorflow_linear(num_workers=2, use_gpu=True) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ From 37b81825e4c6fe406f88cc92446afac021ca4c74 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 19:28:38 +0000 Subject: [PATCH 18/70] Apply suggestions from code review --- python/ray/train/BUILD | 8 + .../train/examples/horovod/horovod_example.py | 2 - .../tensorflow_linear_dataset_example.py | 30 +- .../examples/train_linear_dataset_example.py | 5 - .../train/examples/train_linear_example.py | 5 - python/ray/train/tests/test_callbacks.py | 357 ++++++++++++++++++ python/ray/train/tests/test_minimal.py | 2 +- python/ray/train/tests/test_tune.py | 28 +- 8 files changed, 407 insertions(+), 30 deletions(-) create mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6124c35c2606..6f719b725e64 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -129,6 +129,14 @@ py_test( deps = [":train_lib"] ) +py_test( + name = "test_callbacks", + size = "medium", + srcs = ["tests/test_callbacks.py"], + tags = ["team:ml", "exclusive"], + deps = [":train_lib"] +) + py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index 1e163da70052..c01788008ec5 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -144,13 +144,11 @@ def train_func(config): model, optimizer, train_loader, train_sampler = setup(config) - results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) train.report(loss=loss) - return results def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9271c5125da4..9dbb3205b7bf 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -1,12 +1,13 @@ import argparse +from typing import Dict, Tuple import tensorflow as tf from tensorflow.keras.callbacks import Callback import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard @@ -15,17 +16,22 @@ def on_epoch_end(self, epoch, logs=None): train.report(**logs) -def get_dataset_pipeline(a=5, b=10, size=1000) -> DatasetPipeline: +def get_datasets_and_configs( + a=5, b=10, size=1000 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) return dataset - dataset = 
get_dataset(a, b, size) + datasets = {"train": get_dataset(a, b, size)} - dataset_pipeline = dataset.repeat().random_shuffle_each_window() + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + } - return dataset_pipeline + return datasets, dataset_configs def build_and_compile_model(config): @@ -57,7 +63,6 @@ def train_func(config): dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() - results = [] for _ in range(epochs): dataset = next(dataset_iterator) tf_dataset = prepare_dataset_shard( @@ -70,17 +75,16 @@ def train_func(config): batch_size=batch_size, ) ) - history = multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) - results.append(history.history) - return results + multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) def train_tensorflow_linear(num_workers=2, use_gpu=False): - dataset_pipeline = get_dataset_pipeline() + datasets, dataset_configs = get_datasets_and_configs() trainer = TensorflowTrainer( train_func, train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, - datasets={"train": dataset_pipeline}, + datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -113,8 +117,8 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 3038ac66aa9e..acfa0ce2e637 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -87,8 +87,6 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs() validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs() @@ -116,9 +114,6 @@ def train_func(config): train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False): diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 069c6dd13db1..7e09acef3d3d 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,15 +74,10 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py new file mode 100644 index 000000000000..b21adf6634b9 --- /dev/null +++ b/python/ray/train/tests/test_callbacks.py @@ -0,0 +1,357 @@ +from typing import Dict, List +import glob +import io +import json +from collections import defaultdict +from contextlib import 
redirect_stdout +from pathlib import Path + +import pytest + +import ray +import ray.train as train +from ray.train import Trainer +from ray.train.backend import BackendConfig, Backend +from ray.train.callbacks import ( + TrainingCallback, + JsonLoggerCallback, + PrintCallback, + TBXLoggerCallback, + TorchTensorboardProfilerCallback, +) +from ray.train.callbacks.logging import ( + MLflowLoggerCallback, + _TrainCallbackLogdirManager, +) +from ray.train.constants import ( + TRAINING_ITERATION, + DETAILED_AUTOFILLED_KEYS, + BASIC_AUTOFILLED_KEYS, + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) + +try: + from tensorflow.python.summary.summary_iterator import summary_iterator +except ImportError: + summary_iterator = None + + +@pytest.fixture +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) + yield address_info + # The code after the yield will run as teardown code. + ray.shutdown() + + +class TestConfig(BackendConfig): + @property + def backend_cls(self): + return TestBackend + + +class TestBackend(Backend): + def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + +def test_print(ray_start_4_cpus): + num_workers = 4 + + def train_func(): + train.report(rank=train.world_rank()) + + stream = io.StringIO() + with redirect_stdout(stream): + trainer = Trainer(TestConfig(), num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[PrintCallback()]) + trainer.shutdown() + + output = stream.getvalue() + results = json.loads(output) + + assert len(results) == num_workers + for i, result in enumerate(results): + assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) + assert result["rank"] == i + + +@pytest.mark.parametrize("input", [None, "dir", "file"]) +def test_train_callback_logdir_manager(tmp_path, input): + default_dir = tmp_path / "default_dir" + + if input == "dir": + input_logdir = tmp_path / "dir" + input_logdir.mkdir(parents=True) + elif input == "file": + input_logdir = tmp_path / "file" + input_logdir.touch() + else: + input_logdir = None + + logdir_manager = _TrainCallbackLogdirManager(input_logdir) + + if input_logdir: + path = logdir_manager.logdir_path + assert path == logdir_manager.logdir_path + else: + with pytest.raises(RuntimeError): + path = logdir_manager.logdir_path + + if input_logdir and not Path(input_logdir).is_dir(): + with pytest.raises(FileExistsError): + logdir_manager.setup_logdir(str(default_dir)) + else: + path = logdir_manager.setup_logdir(str(default_dir)) + assert path == logdir_manager.logdir_path + + +@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) +@pytest.mark.parametrize("detailed", [False, True]) +@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) +def test_json( + monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename +): + if detailed: + monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") + + config = TestConfig() + + num_iters = 5 + num_workers = 4 + + if workers_to_log is None: + num_workers_to_log = num_workers + elif isinstance(workers_to_log, int): + num_workers_to_log = 1 + else: + num_workers_to_log = len(workers_to_log) + + def train_func(): + for i in range(num_iters): + train.report(index=i) + return 1 + + if filename is None: + # if None, use default value + 
callback = JsonLoggerCallback(workers_to_log=workers_to_log) + else: + callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) + trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + if filename is None: + assert str(callback.log_path.name) == JsonLoggerCallback._default_filename + else: + assert str(callback.log_path.name) == filename + + with open(callback.log_path, "r") as f: + log = json.load(f) + print(log) + assert len(log) == num_iters + assert len(log[0]) == num_workers_to_log + assert all(len(element) == len(log[0]) for element in log) + assert all( + all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) + for element in log + ) + assert all( + all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) + for element in log + ) + if detailed: + assert all( + all( + all(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + else: + assert all( + all( + not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + + +def _validate_tbx_result(events_dir): + events_file = list(glob.glob(f"{events_dir}/events*"))[0] + results = defaultdict(list) + for event in summary_iterator(events_file): + for v in event.summary.value: + assert v.tag.startswith("ray/train") + results[v.tag[10:]].append(v.simple_value) + + assert len(results["episode_reward_mean"]) == 3 + assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] + assert len(results["score"]) == 1 + assert len(results["hello/world"]) == 1 + + +def test_TBX(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + + def train_func(): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) + return 1 + + callback = TBXLoggerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + _validate_tbx_result(temp_dir) + + +def test_mlflow(ray_start_4_cpus, tmp_path): + config = TestConfig() + + params = {"p1": "p1"} + + temp_dir = tmp_path + num_workers = 4 + + def train_func(config): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6) + return 1 + + callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, config=params, callbacks=[callback]) + + from mlflow.tracking import MlflowClient + + client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) + + experiment_id = client.get_experiment_by_name("test_exp").experiment_id + all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) + assert len(all_runs) == 1 + # all_runs is a pandas dataframe. 
+ all_runs = all_runs.to_dict(orient="records") + run_id = all_runs[0]["run_id"] + run = client.get_run(run_id) + + assert run.data.params == params + assert ( + "episode_reward_mean" in run.data.metrics + and run.data.metrics["episode_reward_mean"] == 6.0 + ) + assert ( + TRAINING_ITERATION in run.data.metrics + and run.data.metrics[TRAINING_ITERATION] == 3.0 + ) + + metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") + + assert len(metric_history) == 3 + iterations = [metric.step for metric in metric_history] + assert iterations == [1, 2, 3] + rewards = [metric.value for metric in metric_history] + assert rewards == [4, 5, 6] + + +def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + num_epochs = 2 + + def train_func(): + from ray.train.torch import TorchWorkerProfiler + from torch.profiler import profile, record_function, schedule + + twp = TorchWorkerProfiler() + with profile( + activities=[], + schedule=schedule(wait=0, warmup=0, active=1), + on_trace_ready=twp.trace_handler, + ) as p: + + for epoch in range(num_epochs): + with record_function("test_function"): + pass + + p.step() + + profile_results = twp.get_and_clear_profile_traces() + train.report(epoch=epoch, **profile_results) + + callback = TorchTensorboardProfilerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + assert temp_dir.exists() + + count = 0 + for path in temp_dir.iterdir(): + assert path.is_file() + count += 1 + assert count == num_workers * num_epochs + + +# fix issue: repeat assignments for preprocessor results nested recursive calling +# see https://github.com/ray-project/ray/issues/25005 +def test_hotfix_callback_nested_recusive_calling(): + # test callback used to simulate the nested recursive calling for preprocess() + class TestCallback(TrainingCallback): + def __init__(self): + self.max_process_time = 0 + + def count_process_times(self, processor): + count = 0 + if processor: + if isinstance(processor, SequentialResultsPreprocessor): + for preprocessor in processor.preprocessors: + # recursive calling preprocessors in list + count += self.count_process_times(preprocessor) + else: + count = 1 + return count + + def handle_result(self, results: List[Dict], **info): + process_times = self.count_process_times(self.results_preprocessor) + if process_times > self.max_process_time: + self.max_process_time = process_times + print(f"process times: {process_times}") + + def train_func(): + for idx in range(num_iterates): + train.report(iterate=idx + 1) + + # python default limitation for iterate depth + num_iterates = 1000 + trainer = Trainer(TestConfig(), num_workers=1) + trainer.start() + test_callback = TestCallback() + trainer.run(train_func, callbacks=[test_callback]) + assert test_callback.max_process_time == 1 + print(f"callback max process time: {test_callback.max_process_time}") + trainer.shutdown() + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 5f3be1d4c3b3..a23d7b4f23f9 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -59,7 +59,7 @@ def train_func(): ) results = trainer.fit() - assert results.checkpoint + assert results.checkpoint.to_dict()[key] == 
checkpoint.to_dict()[key] def test_failure(): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 2fed4e42fa43..3fac9a1e6599 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -7,7 +7,6 @@ from ray import tune from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig -from ray.train import Trainer from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig from ray.train.data_parallel_trainer import DataParallelTrainer @@ -115,16 +114,37 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) +def test_tune_error(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + ) + + # with pytest.raises(TuneError): + tuner.fit() + print("a") + + def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) train.save_checkpoint(hello="world") - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) - [trial] = tune.run(TestTrainable).trials + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data assert os.path.exists(checkpoint_path) checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() From 6f8d7e092c5ea919c99479781b8483f328ceaec6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:20 +0000 Subject: [PATCH 19/70] Fix tracked checkpoint error --- python/ray/util/ml_utils/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9dced9377750..9a27acd10e36 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -132,7 +132,7 @@ def to_air_checkpoint(self) -> Optional[Checkpoint]: checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data) checkpoint = Checkpoint.from_directory(checkpoint_dir) elif isinstance(checkpoint_data, bytes): - with tempfile.mkdtemp() as tmpdir: + with tempfile.TemporaryDirectory() as tmpdir: TrainableUtil.create_from_pickle(checkpoint_data, tmpdir) # Double wrap in checkpoint so we hold the data in memory and # can remove the temp directory From 85cb1a71e90ee21d5095a817ff10feb478da7922 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:39 +0000 Subject: [PATCH 20/70] CI fixes --- .../tensorflow_linear_dataset_example.py | 2 +- .../train/examples/train_linear_example.py | 4 +++ python/ray/train/tests/test_callbacks.py | 26 ++++++++++--------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9dbb3205b7bf..ccc408455b44 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -88,7 +88,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): 
scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results}") return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 7e09acef3d3d..ceabd0c2853f 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,10 +74,14 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) + results = [] for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) + results.append(result) + # return required for backwards compatibility with the old API + return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py index b21adf6634b9..eb6ed7c3db17 100644 --- a/python/ray/train/tests/test_callbacks.py +++ b/python/ray/train/tests/test_callbacks.py @@ -1,37 +1,37 @@ -from typing import Dict, List import glob import io import json from collections import defaultdict from contextlib import redirect_stdout from pathlib import Path +from typing import Dict, List import pytest import ray import ray.train as train from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig from ray.train.callbacks import ( - TrainingCallback, JsonLoggerCallback, PrintCallback, TBXLoggerCallback, TorchTensorboardProfilerCallback, + TrainingCallback, ) from ray.train.callbacks.logging import ( MLflowLoggerCallback, _TrainCallbackLogdirManager, ) from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, BASIC_AUTOFILLED_KEYS, + DETAILED_AUTOFILLED_KEYS, ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, + TRAINING_ITERATION, ) try: @@ -277,9 +277,10 @@ def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): num_epochs = 2 def train_func(): - from ray.train.torch import TorchWorkerProfiler from torch.profiler import profile, record_function, schedule + from ray.train.torch import TorchWorkerProfiler + twp = TorchWorkerProfiler() with profile( activities=[], @@ -351,7 +352,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys - sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file + import pytest + + sys.exit(pytest.main(["-v", "-x", __file__])) From 86a71d6bd6958b639e3eeaa19264bc99398c0aa5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 16:10:20 +0000 Subject: [PATCH 21/70] Add checkpoint configuration to `RunConfig` --- python/ray/air/config.py | 23 ++++++++++++++++++++++- python/ray/tune/impl/tuner_internal.py | 10 ++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index a8f5c2b85c66..d75dc3c0cc5d 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -273,6 +273,25 @@ class FailureConfig: max_failures: int = 0 +@dataclass +@PublicAPI(stability="alpha") +class CheckpointingConfig: + 
"""Configuration related to checkpointing of each run/trial. + + Args: + keep_checkpoints_num: Number of checkpoints to keep. A value of + `None` keeps all checkpoints. Defaults to `None`. If set, need + to provide `checkpoint_score_attr`. + checkpoint_score_attr: Specifies by which attribute to rank the + best checkpoint. Default is increasing order. If attribute starts + with `min-` it will rank attribute in decreasing order, i.e. + `min-validation_loss`. + """ + + keep_checkpoints_num: Optional[int] = None + checkpoint_score_attr: Optional[str] = None + + @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -298,8 +317,9 @@ class RunConfig: Currently only stateless callbacks are supported for resumed runs. (any state of the callback will not be checkpointed by Tune and thus will not take effect in resumed runs). - failure: The failure mode configuration. + failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. + checkpointing: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -312,4 +332,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None + checkpointing: Optional[CheckpointingConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 51d67a5a5c9b..7190f6c6ff38 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -166,6 +166,16 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: max_failures=( self._run_config.failure.max_failures if self._run_config.failure else 0 ), + keep_checkpoints_num=( + self._run_config.checkpointing.keep_checkpoints_num + if self._run_config.checkpointing + else None + ), + checkpoint_score_attr=( + self._run_config.checkpointing.checkpoint_score_attr + if self._run_config.checkpointing + else None + ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, From 41eb7809fd2f7bcb1188080f927d95cc5ed23645 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 18:41:58 +0000 Subject: [PATCH 22/70] Add `best_checkpoint` and `dataframe` to `Result` --- python/ray/air/result.py | 9 ++++ python/ray/tune/impl/tuner_internal.py | 49 ++++++++++++---------- python/ray/tune/result_grid.py | 6 +++ python/ray/tune/tests/test_result_grid.py | 50 +++++++++++++++++++++++ 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..2b52fb844244 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -4,6 +4,8 @@ from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI +import pandas as pd + @dataclass @PublicAPI(stability="alpha") @@ -21,12 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. + best_checkpoint: The best checkpoint of the Trainable, as + determined by the ``metric`` and ``mode`` arguments set. + If either of those has not been set, this will be None. + May be the same as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. 
+ dataframe: The full result dataframe of the Trainable. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] + best_checkpoint: Optional[Checkpoint] error: Optional[Exception] + dataframe: Optional[pd.DataFrame] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 7190f6c6ff38..b2ea167369d7 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -149,17 +149,11 @@ def fit(self) -> ResultGrid: return ResultGrid(analysis) - def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: - """Fitting for a fresh Tuner.""" - analysis = run( - trainable, - config={**param_space}, + def _get_tune_run_arguments(self) -> Dict[str, Any]: + """Get tune.run arguments common for both new and resumed runs.""" + return dict( mode=self._tune_config.mode, metric=self._tune_config.metric, - num_samples=self._tune_config.num_samples, - search_alg=self._tune_config.search_alg, - scheduler=self._tune_config.scheduler, - name=self._run_config.name, callbacks=self._run_config.callbacks, sync_config=self._run_config.sync_config, stop=self._run_config.stop, @@ -179,27 +173,38 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, + ) + + def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: + """Fitting for a fresh Tuner.""" + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + config={**param_space}, + num_samples=self._tune_config.num_samples, + search_alg=self._tune_config.search_alg, + scheduler=self._tune_config.scheduler, + name=self._run_config.name, + ), **self._tuner_kwargs, + } + analysis = run( + **args, ) return analysis def _fit_resume(self, trainable) -> ExperimentAnalysis: """Fitting for a restored Tuner.""" - analysis = run( - trainable, - resume=True, - mode=self._tune_config.mode, - metric=self._tune_config.metric, - callbacks=self._run_config.callbacks, - sync_config=self._run_config.sync_config, - stop=self._run_config.stop, - max_failures=( - self._run_config.failure.max_failures if self._run_config.failure else 0 + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + resume=True, ), - _experiment_checkpoint_dir=self._experiment_checkpoint_dir, - raise_on_failed_trial=False, **self._tuner_kwargs, - ) + } + analysis = run(**args) return analysis def __getstate__(self): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9d653ecb4991..2568ebe46b09 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -165,10 +165,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + try: + best_checkpoint = self._experiment_analysis.best_checkpoint + except ValueError: + best_checkpoint = None result = Result( checkpoint=checkpoint, + best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 77fae7453edf..dbcab0037d50 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ 
b/python/ray/tune/tests/test_result_grid.py @@ -3,6 +3,7 @@ import pickle import pytest +import pandas as pd import ray from ray import tune @@ -40,6 +41,55 @@ def f(config): assert result.metrics["config"] == result.config +def test_result_grid_metric_mode(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert os.path.normpath( + result.checkpoint.get_internal_representation()[1] + ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + +def test_result_grid_metric_mode_unset(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}) + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert result.best_checkpoint is None + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + def test_result_grid_no_checkpoint(ray_start_2_cpus): def f(config): pass From eb2eb6717ff59a29072a566377760fb5cc1e2025 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 19:53:31 +0000 Subject: [PATCH 23/70] Tests, fixes --- python/ray/air/__init__.py | 10 ++- python/ray/air/config.py | 43 +++++++++++-- python/ray/air/result.py | 6 +- python/ray/air/tests/test_api.py | 28 +++++++++ python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/tune/result_grid.py | 28 ++++++++- python/ray/tune/tests/test_result_grid.py | 4 +- python/ray/tune/tests/test_tuner.py | 76 ++++++++++++++++++++++- 8 files changed, 181 insertions(+), 16 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 196fa1aa7e35..2c82cce8f4e3 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -1,5 +1,11 @@ from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import ( + DatasetConfig, + RunConfig, + ScalingConfig, + FailureConfig, + CheckpointingConfig, +) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -11,5 +17,7 @@ "Result", "ScalingConfig", "DatasetConfig", + "FailureConfig", + "CheckpointingConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index d75dc3c0cc5d..ab9cfe79b67e 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -282,14 +282,47 @@ class CheckpointingConfig: 
keep_checkpoints_num: Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. - checkpoint_score_attr: Specifies by which attribute to rank the - best checkpoint. Default is increasing order. If attribute starts - with `min-` it will rank attribute in decreasing order, i.e. - `min-validation_loss`. + checkpoint_score_metric: Specifies by which metric to rank the + best checkpoint. Defaults to training iteration. + checkpoint_score_mode: Must be one of [min, max]. Determines + whether ``checkpoint_score_metric`` should be minimized or maximized. + If not set, will be the same as 'max'. Cannot be set if + ``checkpoint_score_metric`` is not set. """ keep_checkpoints_num: Optional[int] = None - checkpoint_score_attr: Optional[str] = None + checkpoint_score_metric: Optional[str] = None + checkpoint_score_mode: Optional[str] = None + + def __post_init__(self): + if self.checkpoint_score_mode not in (None, "min", "max"): + raise ValueError( + "The `checkpoint_score_mode` parameter can only be " + f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." + ) + if ( + self.checkpoint_score_metric is None + and self.checkpoint_score_mode is not None + ): + raise ValueError( + "`checkpoint_score_mode` cannot be set if " + "`checkpoint_score_metric` is not set." + ) + + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_metric is None: + return self.checkpoint_score_metric + prefix = "" + if self.checkpoint_score_mode == "min": + prefix = "min-" + return f"{prefix}{self.checkpoint_score_metric}" + + @property + def checkpoint_score_mode_not_none(self) -> str: + """``checkpoint_score_mode`` but None -> 'max'""" + return self.checkpoint_score_mode or "max" @dataclass diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 2b52fb844244..954c0b8f9054 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,9 +24,11 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. best_checkpoint: The best checkpoint of the Trainable, as - determined by the ``metric`` and ``mode`` arguments set. + determined by the ``checkpointing`` argument of ``RunConfig``, + or, if that's unset, by ``metric`` and ``mode`` arguments of + ``TuneConfig``. If either of those has not been set, this will be None. - May be the same as ``checkpoint``. + May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. dataframe: The full result dataframe of the Trainable. 
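The checkpoint_score_attr property added above translates the new metric/mode pair back into the legacy attribute string. A few illustrative values:

from ray.air.config import CheckpointingConfig

# No metric set: the property returns None.
assert CheckpointingConfig().checkpoint_score_attr is None

# Default mode (or "max") keeps the plain metric name.
assert CheckpointingConfig(checkpoint_score_metric="loss").checkpoint_score_attr == "loss"

# mode="min" adds the "min-" prefix expected by tune.run.
assert (
    CheckpointingConfig(
        checkpoint_score_metric="loss", checkpoint_score_mode="min"
    ).checkpoint_score_attr
    == "min-loss"
)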
""" diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index dce2ce930c8d..20138448a77d 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,6 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass +from ray.air.config import CheckpointingConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -38,6 +39,33 @@ def test_run_config(): DummyTrainer(run_config=ray.air.RunConfig()) +def test_checkpointing_config(): + # cannot set checkpoint_score_mode if checkpoint_score_metric is unset + with pytest.raises(ValueError): + CheckpointingConfig(checkpoint_score_mode="min") + + with pytest.raises(ValueError): + CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + ) + + checkpointing = CheckpointingConfig() + assert checkpointing.checkpoint_score_attr is None + + checkpointing = CheckpointingConfig(checkpoint_score_metric="metric") + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="max" + ) + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="min" + ) + assert checkpointing.checkpoint_score_attr == "min-metric" + + def test_scaling_config(): with pytest.raises(ValueError): DummyTrainer(scaling_config="invalid") diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index b2ea167369d7..7a0bf39eff6a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis) + return ResultGrid(analysis, self._run_config.checkpointing) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 2568ebe46b09..ef4dd58b5064 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import pandas as pd @@ -11,6 +11,9 @@ from ray.tune.trial import Trial from ray.util import PublicAPI +if TYPE_CHECKING: + from ray.air.config import CheckpointingConfig + @PublicAPI(stability="alpha") class ResultGrid: @@ -40,8 +43,14 @@ class ResultGrid: seen by Tune will be provided. 
""" - def __init__(self, experiment_analysis: ExperimentAnalysis): + def __init__( + self, + experiment_analysis: ExperimentAnalysis, + checkpointing_config: Optional["CheckpointingConfig"] = None, + ): self._experiment_analysis = experiment_analysis + # Used to determine best checkpoint + self._checkpointing_config = checkpointing_config def get_best_result( self, @@ -165,8 +174,21 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + + checkpoint_metric = ( + self._checkpointing_config.checkpoint_score_metric + if self._checkpointing_config + else None + ) + checkpoint_mode = ( + self._checkpointing_config.checkpoint_score_mode_not_none + if self._checkpointing_config and checkpoint_metric + else None + ) try: - best_checkpoint = self._experiment_analysis.best_checkpoint + best_checkpoint = self._experiment_analysis.get_best_checkpoint( + trial, metric=checkpoint_metric, mode=checkpoint_mode + ) except ValueError: best_checkpoint = None diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index dbcab0037d50..6a81cfd01d1d 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -50,7 +50,7 @@ def f(config): f.write(json.dumps({"step": i})) tune.report(step=i) - analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis = tune.run(f, config={"a": 1}, metric="step", mode="min") analysis._legacy_checkpoint = False result_grid = ResultGrid(analysis) result = result_grid[0] @@ -61,7 +61,7 @@ def f(config): assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index a7cad997092c..07a088efacf6 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import RunConfig +from ray.air.config import CheckpointingConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -32,6 +32,16 @@ class DummyTrainer(BaseTrainer): "placement_strategy", ] + def training_loop(self) -> None: + for i in range(5): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(str(i)) + tune.report(step=i) + + +class FailingTrainer(DummyTrainer): def training_loop(self) -> None: raise RuntimeError("There is an error in trainer!") @@ -189,7 +199,7 @@ def on_step_end(self, iteration, trials, **kwargs): assert len(results) == 4 def test_tuner_trainer_fail(self): - trainer = DummyTrainer() + trainer = FailingTrainer() param_space = { "scaling_config": { "num_workers": tune.grid_search([1, 2]), @@ -243,6 +253,68 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} + def test_tuner_checkpoint_configuration(self): + # Case 1: nothing set + trainer = DummyTrainer() + tuner = Tuner(trainer) + + results = tuner.fit() + result = results[0] + assert 
result.checkpoint + assert not result.best_checkpoint + + # Case 2: metric and mode set + trainer = DummyTrainer() + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + + # Case 3: CheckpointingConfig set. Takes priority. + trainer = DummyTrainer( + run_config=RunConfig( + checkpointing=CheckpointingConfig( + checkpoint_score_metric="step", checkpoint_score_mode="min" + ) + ) + ) + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + if __name__ == "__main__": import sys From 024932e8acaf3a1deeb2a3af7ccdb9965589bbd3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:17:52 +0000 Subject: [PATCH 24/70] Result grid tweaks --- python/ray/tune/result_grid.py | 43 +++++++++++++++--- python/ray/tune/tests/test_result_grid.py | 53 +++++++++++++++++++++++ 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ef4dd58b5064..78f05e6dcd20 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -58,6 +58,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, + checkpointing_config: Union[bool, "CheckpointingConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -79,6 +80,13 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( @@ -92,6 +100,10 @@ def get_best_result( "`get_best_result` or specify a mode in the " "`TuneConfig` of your `Tuner`." 
) + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -112,7 +124,19 @@ def get_best_result( ) raise RuntimeError(error_msg) - return self._trial_to_result(best_trial) + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return self._trial_to_result( + best_trial, checkpointing_config=checkpointing_config + ) def get_dataframe( self, @@ -159,7 +183,10 @@ def __len__(self) -> int: def __getitem__(self, i) -> Result: """Returns the i'th result in the grid.""" - return self._trial_to_result(self._experiment_analysis.trials[i]) + return self._trial_to_result( + self._experiment_analysis.trials[i], + checkpointing_config=self._checkpointing_config, + ) @staticmethod def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError]]: @@ -172,17 +199,19 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result(self, trial: Trial) -> Result: + def _trial_to_result( + self, trial: Trial, checkpointing_config: "CheckpointingConfig" + ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - self._checkpointing_config.checkpoint_score_metric - if self._checkpointing_config + checkpointing_config.checkpoint_score_metric + if checkpointing_config else None ) checkpoint_mode = ( - self._checkpointing_config.checkpoint_score_mode_not_none - if self._checkpointing_config and checkpoint_metric + checkpointing_config.checkpoint_score_mode_not_none + if checkpointing_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 6a81cfd01d1d..c6bcb7af2077 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -150,6 +150,59 @@ def f(config): assert best_result.metrics["x"] == 2 +def test_best_result_best_checkpoint(ray_start_2_cpus): + from ray.air.config import CheckpointingConfig + + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) + tune.report(x=config["x"] * (i + 1), step=i) + + def load_checkpoint(result): + with open( + os.path.join(result.best_checkpoint.to_directory(), "checkpoint") + ) as f: + checkpoint_data = json.load(f) + return checkpoint_data + + analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) + + # No checkpointing config. Use metric and mode + result_grid = ResultGrid(analysis) + best_result = result_grid.get_best_result(metric="x", mode="max") + assert best_result.metrics["x"] == 6 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + # Checkpointing config. 
Use by default + result_grid = ResultGrid( + analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + ) + best_result = result_grid.get_best_result(metric="x", mode="min") + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + best_result = result_grid.get_best_result( + metric="x", mode="min", checkpointing_config=False + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 0 + + best_result = result_grid.get_best_result( + metric="x", + mode="min", + checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + def test_best_result_no_report(ray_start_2_cpus): def f(config): pass From abf2cdc9a18d56147d1dfd2aec4b56eab0a1223b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:24:23 +0000 Subject: [PATCH 25/70] Extend --- python/ray/tune/result_grid.py | 62 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 78f05e6dcd20..896cd1b885c8 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -52,6 +52,27 @@ def __init__( # Used to determine best checkpoint self._checkpointing_config = checkpointing_config + def _resolve_checkpointing_config( + self, + checkpointing_config: "CheckpointingConfig", + metric: Optional[str] = None, + mode: Optional[str] = None, + ) -> "CheckpointingConfig": + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return checkpointing_config + def get_best_result( self, metric: Optional[str] = None, @@ -101,9 +122,6 @@ def get_best_result( "`TuneConfig` of your `Tuner`." 
) - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -124,16 +142,10 @@ def get_best_result( ) raise RuntimeError(error_msg) - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + checkpointing_config = self._resolve_checkpointing_config( + checkpointing_config, metric=metric, mode=mode + ) - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config - else: - checkpointing_config = CheckpointingConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) return self._trial_to_result( best_trial, checkpointing_config=checkpointing_config ) @@ -181,11 +193,33 @@ def get_dataframe( def __len__(self) -> int: return len(self._experiment_analysis.trials) - def __getitem__(self, i) -> Result: + def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" + return self.get( + self._experiment_analysis.trials[i], + ) + + def get( + self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True + ): + """Returns the i'th result in the grid. + + Args: + i: index to return. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. + """ + + checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=self._checkpointing_config, + checkpointing_config=checkpointing_config, ) @staticmethod From 563bc338b976e0a4a98d51982cddada388ceb7a6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 05:38:46 +0200 Subject: [PATCH 26/70] Update result_grid.py --- python/ray/tune/result_grid.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 896cd1b885c8..9321de0d8cdb 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -195,9 +195,7 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get( - self._experiment_analysis.trials[i], - ) + return self.get(i) def get( self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True From d0261bea6d4f40414491d84fe9017cc7ad335c45 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:23 +0000 Subject: [PATCH 27/70] Fix --- python/ray/tune/result_grid.py | 2 +- python/ray/tune/tests/test_result_grid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9321de0d8cdb..344c4356938f 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -232,7 +232,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: "CheckpointingConfig" + self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] ) -> Result: checkpoint = 
trial.checkpoint.to_air_checkpoint() diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index c6bcb7af2077..bccd553469a3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial) + result = result_grid._trial_to_result(trial, checkpointing_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) From 56df4936e05391ae27255d762b1f32f4f2105138 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:47 +0000 Subject: [PATCH 28/70] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 344c4356938f..4513935697a0 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -107,7 +107,7 @@ def get_best_result( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( From ef0c75ae685afdc31b70a66f43701b265f59decc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 17:57:36 +0000 Subject: [PATCH 29/70] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 4513935697a0..6f5edc3d3f13 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -210,7 +210,7 @@ def get( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. 
""" checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) From 3464c93c5eb8fdd7ea0fb3a5c6cf4a071371246d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 18:43:36 +0000 Subject: [PATCH 30/70] WIP --- python/ray/train/tests/test_tune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 3fac9a1e6599..dafe52241312 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -125,9 +126,8 @@ def train_func(config): trainer, ) - # with pytest.raises(TuneError): - tuner.fit() - print("a") + with pytest.raises(TuneError): + tuner.fit() def test_tune_checkpoint(ray_start_4_cpus): From ee87c12d772860c18f80d4a2bb4a5c2514a81195 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 19:12:46 +0000 Subject: [PATCH 31/70] Renaming --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 6 +- python/ray/air/result.py | 12 ++-- python/ray/air/tests/test_api.py | 14 ++--- python/ray/tune/impl/tuner_internal.py | 10 ++-- python/ray/tune/result_grid.py | 68 +++++++++++------------ python/ray/tune/tests/test_result_grid.py | 10 ++-- python/ray/tune/tests/test_tuner.py | 6 +- 8 files changed, 63 insertions(+), 67 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 2c82cce8f4e3..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,7 +4,7 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointingConfig, + CheckpointConfig, ) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointingConfig", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index ab9cfe79b67e..5b0a886cc3e2 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -275,7 +275,7 @@ class FailureConfig: @dataclass @PublicAPI(stability="alpha") -class CheckpointingConfig: +class CheckpointConfig: """Configuration related to checkpointing of each run/trial. Args: @@ -352,7 +352,7 @@ class RunConfig: and thus will not take effect in resumed runs). failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. - checkpointing: Checkpointing configuration. + checkpoint_config: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -365,5 +365,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpointing: Optional[CheckpointingConfig] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 954c0b8f9054..f615959a2fdc 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -23,11 +23,13 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. 
checkpoint: The final checkpoint of the Trainable. - best_checkpoint: The best checkpoint of the Trainable, as - determined by the ``checkpointing`` argument of ``RunConfig``, - or, if that's unset, by ``metric`` and ``mode`` arguments of - ``TuneConfig``. - If either of those has not been set, this will be None. + best_checkpoint: The best checkpoint of the Trainable. + This will be determined by (from highest priority): + + 1. ``checkpoint_config`` argument of ``run_config`` + 2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``) + + If neither of those has not been set, this will be None. May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. dataframe: The full result dataframe of the Trainable. diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 20138448a77d..136cb0e58473 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointingConfig +from ray.air.config import CheckpointConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -42,25 +42,25 @@ def test_run_config(): def test_checkpointing_config(): # cannot set checkpoint_score_mode if checkpoint_score_metric is unset with pytest.raises(ValueError): - CheckpointingConfig(checkpoint_score_mode="min") + CheckpointConfig(checkpoint_score_mode="min") with pytest.raises(ValueError): - CheckpointingConfig( + CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="invalid" ) - checkpointing = CheckpointingConfig() + checkpointing = CheckpointConfig() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointingConfig(checkpoint_score_metric="metric") + checkpointing = CheckpointConfig(checkpoint_score_metric="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointingConfig( + checkpointing = CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointingConfig( + checkpointing = CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 7a0bf39eff6a..e2dfadf43fdf 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis, self._run_config.checkpointing) + return ResultGrid(analysis, self._run_config.checkpoint_config) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" @@ -161,13 +161,13 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: self._run_config.failure.max_failures if self._run_config.failure else 0 ), keep_checkpoints_num=( - self._run_config.checkpointing.keep_checkpoints_num - if self._run_config.checkpointing + self._run_config.checkpoint_config.keep_checkpoints_num + if self._run_config.checkpoint_config else None ), checkpoint_score_attr=( - self._run_config.checkpointing.checkpoint_score_attr - if self._run_config.checkpointing + 
self._run_config.checkpoint_config.checkpoint_score_attr + if self._run_config.checkpoint_config else None ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 6f5edc3d3f13..afa68b609b59 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -12,7 +12,7 @@ from ray.util import PublicAPI if TYPE_CHECKING: - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig @PublicAPI(stability="alpha") @@ -46,32 +46,32 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpointing_config: Optional["CheckpointingConfig"] = None, + checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis # Used to determine best checkpoint - self._checkpointing_config = checkpointing_config + self._checkpointing_config = checkpoint_config - def _resolve_checkpointing_config( + def _resolve_checkpoint_config( self, - checkpointing_config: "CheckpointingConfig", + checkpoint_config: "CheckpointConfig", metric: Optional[str] = None, mode: Optional[str] = None, - ) -> "CheckpointingConfig": + ) -> "CheckpointConfig": # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig metric = metric or self._experiment_analysis.default_metric mode = mode or self._experiment_analysis.default_mode - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config + if not isinstance(checkpoint_config, CheckpointConfig): + if checkpoint_config and self._checkpointing_config: + checkpoint_config = self._checkpointing_config else: - checkpointing_config = CheckpointingConfig( + checkpoint_config = CheckpointConfig( checkpoint_score_metric=metric, checkpoint_score_mode=mode ) - return checkpointing_config + return checkpoint_config def get_best_result( self, @@ -79,7 +79,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpointing_config: Union[bool, "CheckpointingConfig"] = True, + checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -101,12 +101,12 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``run_config`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: @@ -142,13 +142,11 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpointing_config = self._resolve_checkpointing_config( - checkpointing_config, metric=metric, mode=mode + checkpoint_config = self._resolve_checkpoint_config( + checkpoint_config, metric=metric, mode=mode ) - return self._trial_to_result( - best_trial, checkpointing_config=checkpointing_config - ) + return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) def get_dataframe( self, @@ -197,27 +195,25 @@ def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" return self.get(i) - def get( - self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True - ): + def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): """Returns the i'th result in the grid. Args: i: index to return. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``RunConfig`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. """ - checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=checkpointing_config, + checkpoint_config=checkpoint_config, ) @staticmethod @@ -232,18 +228,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] + self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - checkpointing_config.checkpoint_score_metric - if checkpointing_config - else None + checkpoint_config.checkpoint_score_metric if checkpoint_config else None ) checkpoint_mode = ( - checkpointing_config.checkpoint_score_mode_not_none - if checkpointing_config and checkpoint_metric + checkpoint_config.checkpoint_score_mode_not_none + if checkpoint_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index bccd553469a3..96789116db33 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpointing_config=None) + result = result_grid._trial_to_result(trial, checkpoint_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) @@ -151,7 +151,7 @@ def f(config): def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig def f(config): for i in range(2): @@ -179,7 +179,7 @@ def 
load_checkpoint(result): # Checkpointing config. Use by default result_grid = ResultGrid( - analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") ) best_result = result_grid.get_best_result(metric="x", mode="min") assert best_result.metrics["x"] == 2 @@ -187,7 +187,7 @@ def load_checkpoint(result): assert load_checkpoint(best_result)["step"] == 1 best_result = result_grid.get_best_result( - metric="x", mode="min", checkpointing_config=False + metric="x", mode="min", checkpoint_config=False ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint @@ -196,7 +196,7 @@ def load_checkpoint(result): best_result = result_grid.get_best_result( metric="x", mode="min", - checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index 07a088efacf6..dab5b9a1b51b 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointingConfig, RunConfig +from ray.air.config import CheckpointConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -286,10 +286,10 @@ def test_tuner_checkpoint_configuration(self): != results[1].best_checkpoint.get_internal_representation() ) - # Case 3: CheckpointingConfig set. Takes priority. + # Case 3: CheckpointConfig set. Takes priority. trainer = DummyTrainer( run_config=RunConfig( - checkpointing=CheckpointingConfig( + checkpoint_config=CheckpointConfig( checkpoint_score_metric="step", checkpoint_score_mode="min" ) ) From b10fe1e18745cd045168f256229b64c3b841fa6b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:05:25 +0000 Subject: [PATCH 32/70] Improve test coverage --- python/ray/train/tests/test_examples.py | 12 ++++++++++++ python/ray/train/tests/test_tune.py | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index fd6a2fadbf91..2ebef818d7aa 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,6 +51,10 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" @@ -103,6 +107,10 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. 
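The loss assertions added above rely on Result.dataframe, which holds one row per reported iteration with the reported metrics as columns. The same pattern outside a test, assuming `trainer` is any Trainer whose training loop reports a "loss" key each epoch:

result = trainer.fit()

loss = list(result.dataframe["loss"])
print(f"first epoch: {loss[0]:.4f}, last epoch: {loss[-1]:.4f}")
assert loss[-1] < loss[0], "training did not reduce the loss"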
def test_torch_linear_failure(ray_start_4_cpus): @@ -138,6 +146,10 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index dafe52241312..0196a84e46b6 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,6 @@ import ray import ray.train as train from ray import tune -from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -126,8 +125,9 @@ def train_func(config): trainer, ) - with pytest.raises(TuneError): - tuner.fit() + result_grid = tuner.fit() + with pytest.raises(RuntimeError): + raise result_grid[0].error def test_tune_checkpoint(ray_start_4_cpus): From 4dbcccaba67b23538d969e5df309c506e128d964 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:51:57 +0000 Subject: [PATCH 33/70] Simplify --- python/ray/air/result.py | 18 ++-- .../ray/tune/analysis/experiment_analysis.py | 2 +- python/ray/tune/function_runner.py | 2 + python/ray/tune/result_grid.py | 84 +++---------------- python/ray/tune/tests/test_result_grid.py | 65 +++++--------- python/ray/tune/tests/test_tuner.py | 64 +------------- python/ray/tune/trial.py | 3 + 7 files changed, 46 insertions(+), 192 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index f615959a2fdc..77b3d4b03d28 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Tuple from dataclasses import dataclass from ray.air.checkpoint import Checkpoint @@ -23,23 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. - best_checkpoint: The best checkpoint of the Trainable. - This will be determined by (from highest priority): - - 1. ``checkpoint_config`` argument of ``run_config`` - 2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``) - - If neither of those has not been set, this will be None. - May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + dataframe: The full result dataframe of the Trainable. Each row of the + dataframe corresponds to one iteration and contains reported + metrics. + checkpoint_history: A list of tuples of all checkpoints saved + by the Trainable and their associated metrics. 
""" metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] - best_checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] + checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 84d99637f4c7..f537788f61f9 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -436,7 +436,7 @@ def get_trial_checkpoints_paths( ) return path_metric_df[["chkpt_path", metric]].values.tolist() elif isinstance(trial, Trial): - checkpoints = trial.checkpoint_manager.best_checkpoints() + checkpoints = trial.get_trial_checkpoints() # Support metrics given as paths, e.g. # "info/learner/default_policy/policy_loss". return [ diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index 89930e921351..02f5cf707989 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -441,6 +441,8 @@ def step(self): new_result = self._last_result.copy() new_result.update(result) result = new_result + # Do not checkpoint again + result[SHOULD_CHECKPOINT] = False self._last_result = result if self._status_reporter.has_new_checkpoint(): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index afa68b609b59..12f4cf0c8514 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import pandas as pd @@ -11,9 +11,6 @@ from ray.tune.trial import Trial from ray.util import PublicAPI -if TYPE_CHECKING: - from ray.air.config import CheckpointConfig - @PublicAPI(stability="alpha") class ResultGrid: @@ -46,32 +43,8 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis - # Used to determine best checkpoint - self._checkpointing_config = checkpoint_config - - def _resolve_checkpoint_config( - self, - checkpoint_config: "CheckpointConfig", - metric: Optional[str] = None, - mode: Optional[str] = None, - ) -> "CheckpointConfig": - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointConfig - - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - - if not isinstance(checkpoint_config, CheckpointConfig): - if checkpoint_config and self._checkpointing_config: - checkpoint_config = self._checkpointing_config - else: - checkpoint_config = CheckpointConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) - return checkpoint_config def get_best_result( self, @@ -79,7 +52,6 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. 
@@ -142,11 +114,7 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpoint_config = self._resolve_checkpoint_config( - checkpoint_config, metric=metric, mode=mode - ) - - return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) + return self._trial_to_result(best_trial) def get_dataframe( self, @@ -193,27 +161,8 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get(i) - - def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): - """Returns the i'th result in the grid. - - Args: - i: index to return. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``RunConfig`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. - """ - - checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) - return self._trial_to_result( self._experiment_analysis.trials[i], - checkpoint_config=checkpoint_config, ) @staticmethod @@ -227,31 +176,20 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result( - self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] - ) -> Result: + def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - - checkpoint_metric = ( - checkpoint_config.checkpoint_score_metric if checkpoint_config else None - ) - checkpoint_mode = ( - checkpoint_config.checkpoint_score_mode_not_none - if checkpoint_config and checkpoint_metric - else None - ) - try: - best_checkpoint = self._experiment_analysis.get_best_checkpoint( - trial, metric=checkpoint_metric, mode=checkpoint_mode - ) - except ValueError: - best_checkpoint = None + checkpoint_history = [ + (checkpoint.to_air_checkpoint(), checkpoint.metrics) + for checkpoint in trial.get_trial_checkpoints() + ] result = Result( checkpoint=checkpoint, - best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + if self._experiment_analysis + else None, + checkpoint_history=checkpoint_history, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 96789116db33..0de68be19190 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,13 +55,17 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.checkpoint_history, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath( + min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + 0 + ].get_internal_representation()[1] + ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 @@ 
-81,7 +85,6 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert result.best_checkpoint is None assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) @@ -124,10 +127,11 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpoint_config=None) + result = result_grid._trial_to_result(trial) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) + assert result.dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config @@ -150,57 +154,30 @@ def f(config): assert best_result.metrics["x"] == 2 -def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointConfig - +def test_best_result_checkpoint_history(ray_start_2_cpus): def f(config): for i in range(2): with tune.checkpoint_dir(step=i) as checkpoint_dir: path = os.path.join(checkpoint_dir, "checkpoint") with open(path, "w") as f: - f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) - tune.report(x=config["x"] * (i + 1), step=i) - - def load_checkpoint(result): - with open( - os.path.join(result.best_checkpoint.to_directory(), "checkpoint") - ) as f: - checkpoint_data = json.load(f) - return checkpoint_data + f.write(json.dumps(dict(x=config["x"], step=i))) + tune.report(x=config["x"], step=i) analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) # No checkpointing config. Use metric and mode result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") - assert best_result.metrics["x"] == 6 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - # Checkpointing config. 
Use by default - result_grid = ResultGrid( - analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") - ) - best_result = result_grid.get_best_result(metric="x", mode="min") - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - best_result = result_grid.get_best_result( - metric="x", mode="min", checkpoint_config=False - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 0 - - best_result = result_grid.get_best_result( - metric="x", - mode="min", - checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 + assert best_result.metrics["x"] == 3 + print(best_result.checkpoint_history) + print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) + assert len(best_result.checkpoint_history) == 2 + i = 0 + for checkpoint, metrics in best_result.checkpoint_history: + assert isinstance(checkpoint, Checkpoint) + assert metrics["x"] == 3 + assert metrics["step"] == i + i += 1 def test_best_result_no_report(ray_start_2_cpus): diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index dab5b9a1b51b..e00e41fcdefd 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointConfig, RunConfig +from ray.air.config import RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -253,68 +253,6 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} - def test_tuner_checkpoint_configuration(self): - # Case 1: nothing set - trainer = DummyTrainer() - tuner = Tuner(trainer) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert not result.best_checkpoint - - # Case 2: metric and mode set - trainer = DummyTrainer() - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - - # Case 3: CheckpointConfig set. Takes priority. 
- trainer = DummyTrainer( - run_config=RunConfig( - checkpoint_config=CheckpointConfig( - checkpoint_score_metric="step", checkpoint_score_mode="min" - ) - ) - ) - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - if __name__ == "__main__": import sys diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 73e3944ee289..ea5d4bceb809 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -767,6 +767,9 @@ def get_trainable_cls(self): def is_finished(self): return self.status in [Trial.ERROR, Trial.TERMINATED] + def get_trial_checkpoints(self) -> List[_TrackedCheckpoint]: + return self.checkpoint_manager.best_checkpoints() + @property def is_restoring(self): return self.restoring_from is not None From 27e531c3431c23225e3caa47f92e3d93b871cc0d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:54:43 +0000 Subject: [PATCH 34/70] Docstring tweak --- python/ray/tune/analysis/experiment_analysis.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index f537788f61f9..7b0518cfa5c5 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -366,7 +366,11 @@ def results_df(self) -> DataFrame: @property def trial_dataframes(self) -> Dict[str, DataFrame]: - """List of all dataframes of the trials.""" + """List of all dataframes of the trials. + + Each row of the dataframe corresponds to one iteration of a trial + and contains reported metrics. + """ return self._trial_dataframes def dataframe( From 7d1abfe2a2d6b1786e3b571a5cdc8fbcca256cdf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:56:16 +0000 Subject: [PATCH 35/70] Remove docstring --- python/ray/tune/result_grid.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 12f4cf0c8514..9c216b657e7b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -73,13 +73,6 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``run_config`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: raise ValueError( From b0dd3baf038252f2f4208b18a10e380d8b566a40 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:58:45 +0000 Subject: [PATCH 36/70] Fix --- python/ray/air/result.py | 9 ++++++--- python/ray/tune/result_grid.py | 4 ++-- python/ray/tune/tests/test_result_grid.py | 12 ++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 77b3d4b03d28..a404569b3c9b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -27,15 +27,18 @@ class Result: dataframe: The full result dataframe of the Trainable. Each row of the dataframe corresponds to one iteration and contains reported metrics. - checkpoint_history: A list of tuples of all checkpoints saved - by the Trainable and their associated metrics. + best_checkpoints: A list of tuples of the best checkpoints saved + by the Trainable and their associated metrics. The number of + saved checkpoints is determined by the ``checkpoint_config`` + argument of ``run_config`` (by default, all checkpoints will + be saved). """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] - checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] + best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9c216b657e7b..aaf6a73ecf2b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -171,7 +171,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - checkpoint_history = [ + best_checkpoints = [ (checkpoint.to_air_checkpoint(), checkpoint.metrics) for checkpoint in trial.get_trial_checkpoints() ] @@ -183,6 +183,6 @@ def _trial_to_result(self, trial: Trial) -> Result: dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) if self._experiment_analysis else None, - checkpoint_history=checkpoint_history, + best_checkpoints=best_checkpoints, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 0de68be19190..dc49c404cdc6 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,14 +55,14 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.checkpoint_history, list) + assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( - min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + min((x for x in result.best_checkpoints), key=lambda x: x[1]["step"])[ 0 ].get_internal_representation()[1] ) @@ -169,11 +169,11 @@ def f(config): result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") assert best_result.metrics["x"] == 3 - print(best_result.checkpoint_history) - print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) - assert len(best_result.checkpoint_history) 
== 2 + print(best_result.best_checkpoints) + print([x[0].get_internal_representation() for x in best_result.best_checkpoints]) + assert len(best_result.best_checkpoints) == 2 i = 0 - for checkpoint, metrics in best_result.checkpoint_history: + for checkpoint, metrics in best_result.best_checkpoints: assert isinstance(checkpoint, Checkpoint) assert metrics["x"] == 3 assert metrics["step"] == i From 5b226abab1b98e5bd237a87270bf2cd31f05b8bf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 21:10:01 +0000 Subject: [PATCH 37/70] Tweak docstring --- python/ray/air/result.py | 4 ++-- python/ray/tune/analysis/experiment_analysis.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index a404569b3c9b..d6fa35a4a809 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,8 +24,8 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. Each row of the - dataframe corresponds to one iteration and contains reported + dataframe: The full result dataframe of the Trainable. + The dataframe is indexed by iterations and contains reported metrics. best_checkpoints: A list of tuples of the best checkpoints saved by the Trainable and their associated metrics. The number of diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 7b0518cfa5c5..97dd0a924a51 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -368,8 +368,8 @@ def results_df(self) -> DataFrame: def trial_dataframes(self) -> Dict[str, DataFrame]: """List of all dataframes of the trials. - Each row of the dataframe corresponds to one iteration of a trial - and contains reported metrics. + Each dataframe is indexed by iterations and contains reported + metrics. 
""" return self._trial_dataframes From 65ce1d3c9a1fd668a3c09eece4c07d46476c15b9 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 22:21:39 +0000 Subject: [PATCH 38/70] Fix --- python/ray/tune/impl/tuner_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index e2dfadf43fdf..3faf25ee895c 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis, self._run_config.checkpoint_config) + return ResultGrid(analysis) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" From 1e1fbea7d3392aa70869750a09c3e1467678dc06 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 11:22:08 +0000 Subject: [PATCH 39/70] Use CheckpointStrategy --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 55 +------------------ python/ray/air/tests/test_api.py | 14 ++--- .../ray/util/ml_utils/checkpoint_manager.py | 10 ++++ 4 files changed, 21 insertions(+), 62 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 506f9d022cc0..922f1bc83b94 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointConfig, ) +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointConfig", + "CheckpointStrategy", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b0a886cc3e2..5ea53d92f749 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,6 +5,7 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TYPE_CHECKING: from ray.data import Dataset @@ -273,58 +274,6 @@ class FailureConfig: max_failures: int = 0 -@dataclass -@PublicAPI(stability="alpha") -class CheckpointConfig: - """Configuration related to checkpointing of each run/trial. - - Args: - keep_checkpoints_num: Number of checkpoints to keep. A value of - `None` keeps all checkpoints. Defaults to `None`. If set, need - to provide `checkpoint_score_attr`. - checkpoint_score_metric: Specifies by which metric to rank the - best checkpoint. Defaults to training iteration. - checkpoint_score_mode: Must be one of [min, max]. Determines - whether ``checkpoint_score_metric`` should be minimized or maximized. - If not set, will be the same as 'max'. Cannot be set if - ``checkpoint_score_metric`` is not set. - """ - - keep_checkpoints_num: Optional[int] = None - checkpoint_score_metric: Optional[str] = None - checkpoint_score_mode: Optional[str] = None - - def __post_init__(self): - if self.checkpoint_score_mode not in (None, "min", "max"): - raise ValueError( - "The `checkpoint_score_mode` parameter can only be " - f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." 
- ) - if ( - self.checkpoint_score_metric is None - and self.checkpoint_score_mode is not None - ): - raise ValueError( - "`checkpoint_score_mode` cannot be set if " - "`checkpoint_score_metric` is not set." - ) - - @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" - if self.checkpoint_score_metric is None: - return self.checkpoint_score_metric - prefix = "" - if self.checkpoint_score_mode == "min": - prefix = "min-" - return f"{prefix}{self.checkpoint_score_metric}" - - @property - def checkpoint_score_mode_not_none(self) -> str: - """``checkpoint_score_mode`` but None -> 'max'""" - return self.checkpoint_score_mode or "max" - - @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -365,5 +314,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointConfig] = None + checkpoint_config: Optional[CheckpointStrategy] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 136cb0e58473..d7054c5a81bf 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointConfig +from ray.air.config import CheckpointStrategy from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -42,25 +42,25 @@ def test_run_config(): def test_checkpointing_config(): # cannot set checkpoint_score_mode if checkpoint_score_metric is unset with pytest.raises(ValueError): - CheckpointConfig(checkpoint_score_mode="min") + CheckpointStrategy(checkpoint_score_mode="min") with pytest.raises(ValueError): - CheckpointConfig( + CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="invalid" ) - checkpointing = CheckpointConfig() + checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointConfig(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9a27acd10e36..4b4dc9b8d113 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -230,6 +230,16 @@ def __post_init__(self): f"checkpoint_score_order must be either " f'"{MAX}" or "{MIN}".' 
) + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_attribute is None: + return self.checkpoint_score_attribute + prefix = "" + if self.checkpoint_score_order == MIN: + prefix = "min-" + return f"{prefix}{self.checkpoint_score_attribute}" + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune. From e19d40f542dd8e3af89042331ccc1d94d48692cf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 15:15:17 +0200 Subject: [PATCH 40/70] Fix --- python/ray/air/tests/test_api.py | 12 ++++-------- python/ray/tune/impl/tuner_internal.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index d7054c5a81bf..1c0680860e3f 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -40,28 +40,24 @@ def test_run_config(): def test_checkpointing_config(): - # cannot set checkpoint_score_mode if checkpoint_score_metric is unset - with pytest.raises(ValueError): - CheckpointStrategy(checkpoint_score_mode="min") - with pytest.raises(ValueError): CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="max" + checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="min" + checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 60b89b8acb98..2d348c94d47a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -163,7 +163,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else 0 ), keep_checkpoints_num=( - self._run_config.checkpoint_config.keep_checkpoints_num + self._run_config.checkpoint_config.num_to_keep if self._run_config.checkpoint_config else None ), From fd961746ca582263ecbc6bacc4342e915bd74416 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:43:53 +0000 Subject: [PATCH 41/70] dataframe -> metrics_dataframe --- python/ray/air/result.py | 4 ++-- python/ray/tune/result_grid.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index d6fa35a4a809..5b7f0fcba04b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,7 +24,7 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + metrics_dataframe: The full result dataframe of the Trainable. The dataframe is indexed by iterations and contains reported metrics. 
best_checkpoints: A list of tuples of the best checkpoints saved @@ -37,7 +37,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - dataframe: Optional[pd.DataFrame] + metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ed6711414985..b0cc6f83899e 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -180,7 +180,9 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + metrics_dataframe=self._experiment_analysis.trial_dataframes.get( + trial.logdir + ) if self._experiment_analysis else None, best_checkpoints=best_checkpoints, From 8d5f1b3d63d6b58843f2fd51d39c8502b2293015 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:04:09 +0000 Subject: [PATCH 42/70] CheckpointStrategy -> CheckpointConfig --- doc/source/train/api.rst | 4 --- doc/source/train/user_guide.rst | 10 +++--- python/ray/air/__init__.py | 4 +-- python/ray/air/config.py | 6 ++-- python/ray/air/tests/test_api.py | 12 +++---- python/ray/train/__init__.py | 6 +++- python/ray/train/_internal/checkpoint.py | 8 ++--- python/ray/train/data_parallel_trainer.py | 6 ++-- python/ray/train/tests/test_trainer.py | 12 +++---- python/ray/train/trainer.py | 16 ++++----- python/ray/tune/callback.py | 4 +-- .../ray/tune/execution/checkpoint_manager.py | 4 +-- .../ray/util/ml_utils/checkpoint_manager.py | 33 ++++++++++++++----- .../ml_utils/tests/test_checkpoint_manager.py | 14 ++++---- 14 files changed, 79 insertions(+), 60 deletions(-) diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index 054ad0f30d10..ea8b879cdcd9 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -117,10 +117,6 @@ Checkpointing .. _train-api-checkpoint-strategy: -CheckpointStrategy -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ray.train.CheckpointStrategy .. _train-api-func-utils: diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 7dc1bd79ce3a..ff2b2556afb0 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -700,13 +700,13 @@ As an example, to completely disable writing checkpoints to disk: :emphasize-lines: 8,12 from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): for epoch in range(3): train.save_checkpoint(epoch=epoch) - checkpoint_strategy = CheckpointStrategy(num_to_keep=0) + checkpoint_strategy = CheckpointConfig(num_to_keep=0) trainer = Trainer(backend="torch", num_workers=2) trainer.start() @@ -714,12 +714,12 @@ As an example, to completely disable writing checkpoints to disk: trainer.shutdown() -You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: +You may also config ``CheckpointConfig`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: .. 
code-block:: python from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): @@ -733,7 +733,7 @@ You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints pers train.save_checkpoint(loss=3) # Keep the 2 checkpoints with the smallest "loss" value. - checkpoint_strategy = CheckpointStrategy(num_to_keep=2, + checkpoint_strategy = CheckpointConfig(num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min") diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 922f1bc83b94..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, + CheckpointConfig, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointStrategy", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 00db7cdcf33b..5b7317d283ca 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,7 +5,9 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy + +# Move here later when ml_utils is deprecated +from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: from ray.data import Dataset @@ -314,5 +316,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure_config: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointStrategy] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 1c0680860e3f..e4474c24626a 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointStrategy +from ray.air.config import CheckpointConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -41,22 +41,22 @@ def test_run_config(): def test_checkpointing_config(): with pytest.raises(ValueError): - CheckpointStrategy( + CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) - checkpointing = CheckpointStrategy() + checkpointing = CheckpointConfig() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") + checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff 
--git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 11407fa8a16a..74d360f117b7 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -12,13 +12,16 @@ world_size, ) from ray.train.trainer import Trainer, TrainingIterator +from ray.air.config import CheckpointConfig + +# deprecated from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") __all__ = [ "BackendConfig", - "CheckpointStrategy", + "CheckpointConfig", "get_dataset_shard", "load_checkpoint", "local_rank", @@ -30,4 +33,5 @@ "world_rank", "world_size", "TRAIN_DATASET_KEY", + "CheckpointStrategy", ] diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 0a85f4396e36..8bffe957833d 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -11,7 +11,7 @@ TUNE_CHECKPOINT_ID, TUNE_INSTALLED, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointStrategy +from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointConfig from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager as CommonCheckpointManager, ) @@ -67,7 +67,7 @@ class CheckpointManager(CommonCheckpointManager): def __init__( self, run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.run_dir = run_dir @@ -136,11 +136,11 @@ def _get_next_checkpoint_path(self) -> Optional[Path]: def on_start_training( self, - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Path, latest_checkpoint_id: Optional[int] = 0, ): - checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + checkpoint_strategy = checkpoint_strategy or CheckpointConfig() self._checkpoint_strategy = checkpoint_strategy self._validate_checkpoint_strategy() diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 59ed5cb0be4c..c8729282ab40 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -8,7 +8,7 @@ from ray import tune from ray.air import session from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import DatasetConfig, RunConfig, ScalingConfig, CheckpointConfig from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY from ray.train import BackendConfig, TrainingIterator from ray.train._internal.backend_executor import BackendExecutor, TrialInfo @@ -18,7 +18,7 @@ from ray.train.constants import TRAIN_DATASET_KEY, WILDCARD_KEY from ray.train.trainer import BaseTrainer, GenDataset from ray.util.annotations import DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy, _TrackedCheckpoint +from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.data.preprocessor import Preprocessor @@ -32,7 +32,7 @@ def __init__( self, preprocessor: "Preprocessor", run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.preprocessor = preprocessor super(_DataParallelCheckpointManager, self).__init__( diff --git a/python/ray/train/tests/test_trainer.py b/python/ray/train/tests/test_trainer.py index 207d45f79d28..d366999e8611 100644 --- a/python/ray/train/tests/test_trainer.py +++ 
b/python/ray/train/tests/test_trainer.py @@ -10,7 +10,7 @@ import ray import ray.train as train from ray._private.test_utils import wait_for_condition -from ray.train import Trainer, CheckpointStrategy +from ray.train import Trainer, CheckpointConfig from ray.train.backend import BackendConfig, Backend from ray.train.constants import TRAIN_ENABLE_WORKER_SPREAD_ENV from ray.train.torch import TorchConfig @@ -514,7 +514,7 @@ def test_persisted_checkpoint_strategy(ray_start_2_cpus): logdir = "/tmp/test/trainer/test_persisted_checkpoint_strategy" config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -555,7 +555,7 @@ def validate(): def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir): config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -585,12 +585,12 @@ def train_func(): trainer.start() with pytest.raises(ValueError): - trainer.run(train_func, checkpoint_strategy=CheckpointStrategy(num_to_keep=-1)) + trainer.run(train_func, checkpoint_strategy=CheckpointConfig(num_to_keep=-1)) with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_order="invalid_order" ), ) @@ -598,7 +598,7 @@ def train_func(): with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_attribute="missing_attribute" ), ) diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py index 1980f16235df..0262f92963d8 100644 --- a/python/ray/train/trainer.py +++ b/python/ray/train/trainer.py @@ -9,6 +9,7 @@ import ray from ray.actor import ActorHandle from ray.air.checkpoint import Checkpoint +from ray.air.config import CheckpointConfig from ray.train._internal.backend_executor import ( BackendExecutor, InactiveWorkerGroupError, @@ -42,7 +43,6 @@ TUNE_INSTALLED, ) from ray.util.annotations import Deprecated, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TUNE_INSTALLED: from ray import tune @@ -293,7 +293,7 @@ def run( callbacks: Optional[List[TrainingCallback]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> List[T]: """Runs a training function in a distributed manner. @@ -321,7 +321,7 @@ def run( or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -373,7 +373,7 @@ def run_iterator( config: Optional[Dict[str, Any]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> "TrainingIterator": """Same as ``run`` except returns an iterator over the results. 
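
# Sketch of the renamed configuration with the legacy Trainer API, mirroring
# the updated user guide; `train_func` is assumed to be a training function
# that calls train.save_checkpoint():
from ray.train import CheckpointConfig, Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(
    train_func,
    checkpoint_strategy=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="loss",
        checkpoint_score_order="min",
    ),
)
print(trainer.best_checkpoint_path)  # "best" is defined by the CheckpointConfig
trainer.shutdown()
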
@@ -411,7 +411,7 @@ def train_func(config): ``str`` or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -462,7 +462,7 @@ def latest_checkpoint_dir(self) -> Optional[Path]: def best_checkpoint_path(self) -> Optional[Path]: """Path to the best persisted checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -486,7 +486,7 @@ def latest_checkpoint(self) -> Optional[Dict]: def best_checkpoint(self) -> Optional[Dict]: """Best saved checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -670,7 +670,7 @@ def __init__( dataset_spec: RayDatasetSpec, checkpoint_manager: CheckpointManager, checkpoint: Optional[Union[Dict, str, Path, Checkpoint]], - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Optional[Path] = None, ): self._backend_executor = backend_executor diff --git a/python/ray/tune/callback.py b/python/ray/tune/callback.py index fcf4e24aee3d..450ee55310f7 100644 --- a/python/ray/tune/callback.py +++ b/python/ray/tune/callback.py @@ -3,11 +3,11 @@ import warnings from ray.util.annotations import PublicAPI, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.tune.experiment import Trial from ray.tune.stopper import Stopper + from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint class _CallbackMeta(ABCMeta): @@ -251,7 +251,7 @@ def on_checkpoint( iteration: int, trials: List["Trial"], trial: "Trial", - checkpoint: _TrackedCheckpoint, + checkpoint: "_TrackedCheckpoint", **info, ): """Called after a trial saved a checkpoint with Tune. 
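
# Sketch of a user callback hooking the signature above. `_TrackedCheckpoint`
# is a developer API; `dir_or_data` holds the checkpoint path or data, as used
# elsewhere in this series. Pass an instance via tune.run(..., callbacks=[...]).
from ray import tune

class CheckpointLogger(tune.Callback):
    def on_checkpoint(self, iteration, trials, trial, checkpoint, **info):
        # `checkpoint` is the _TrackedCheckpoint that Tune just saved for `trial`.
        print(f"Iteration {iteration}: {trial} saved {checkpoint.dir_or_data}")
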
diff --git a/python/ray/tune/execution/checkpoint_manager.py b/python/ray/tune/execution/checkpoint_manager.py index 64b68a7fb416..f1295ac6f604 100644 --- a/python/ray/tune/execution/checkpoint_manager.py +++ b/python/ray/tune/execution/checkpoint_manager.py @@ -4,7 +4,7 @@ from ray.tune.result import TRAINING_ITERATION from ray.util.ml_utils.checkpoint_manager import ( - CheckpointStrategy, + CheckpointConfig, MIN, MAX, _CheckpointManager as CommonCheckpointManager, @@ -51,7 +51,7 @@ def __init__( else: checkpoint_score_attr = checkpoint_score_attr - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=keep_checkpoints_num, checkpoint_score_attribute=checkpoint_score_attr, checkpoint_score_order=MIN if checkpoint_score_desc else MAX, diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 12be4aea9f99..493be617e2d4 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -14,8 +14,7 @@ import ray from ray.air import Checkpoint from ray.tune.result import NODE_IP -from ray.util import PublicAPI -from ray.util.annotations import DeveloperAPI +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI from ray.util.ml_utils.util import is_nan MAX = "max" @@ -186,9 +185,10 @@ def __repr__(self): return f"_HeapCheckpoint({repr(self.tracked_checkpoint)})" -@PublicAPI(stability="beta") +# Move to ray.air.config when ml_utils is deprecated. @dataclass -class CheckpointStrategy: +@PublicAPI(stability="alpha") +class CheckpointConfig: """Configurable parameters for defining the checkpointing strategy. Default behavior is to persist all checkpoints to disk. If @@ -196,7 +196,7 @@ class CheckpointStrategy: checkpoints with maximum timestamp, i.e. the most recent checkpoints. Args: - num_to_keep (Optional[int]): The number of checkpoints to keep + num_to_keep: The number of checkpoints to keep on disk for this run. If a checkpoint is persisted to disk after there are already this many checkpoints, then an existing checkpoint will be deleted. If this is ``None`` then checkpoints @@ -208,7 +208,7 @@ class CheckpointStrategy: This attribute must be a key from the checkpoint dictionary which has a numerical value. Per default, the last checkpoints will be kept. - checkpoint_score_order (str). Either "max" or "min". + checkpoint_score_order: Either "max" or "min". If "max", then checkpoints with highest values of ``checkpoint_score_attribute`` will be kept. If "min", then checkpoints with lowest values of @@ -242,6 +242,23 @@ def checkpoint_score_attr(self) -> Optional[str]: return f"{prefix}{self.checkpoint_score_attribute}" +# Alias for backwards compatibility + +deprecation_message = ( + "`CheckpointStrategy` is deprecated and will be removed in " + "the future. Please use `ray.air.config.CheckpointStrategy` " + "instead." +) + + +@Deprecated(message=deprecation_message) +@dataclass +class CheckpointStrategy(CheckpointConfig): + def __post_init__(self): + logger.warning(deprecation_message) + super().__post_init__() + + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune. 
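
# Usage sketch for the renamed dataclass, assuming an AIR trainer defined
# elsewhere. `RunConfig.checkpoint_config` is the field wired up in this series;
# `CheckpointStrategy` remains available as a deprecated alias that warns.
from ray.air import CheckpointConfig, RunConfig

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,                      # keep at most two checkpoints on disk
        checkpoint_score_attribute="loss",  # rank checkpoints by the reported "loss"
        checkpoint_score_order="min",       # lower loss is better
    )
)
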
@@ -269,11 +286,11 @@ class _CheckpointManager: def __init__( self, - checkpoint_strategy: CheckpointStrategy, + checkpoint_strategy: CheckpointConfig, latest_checkpoint_id: int = 0, delete_fn: Optional[Callable[["_TrackedCheckpoint"], None]] = None, ): - self._checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + self._checkpoint_strategy = checkpoint_strategy or CheckpointConfig() # Incremental unique checkpoint ID of this run. self._latest_checkpoint_id = latest_checkpoint_id diff --git a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py index 16fd83a8ecb8..0c0a145ad26b 100644 --- a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py +++ b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py @@ -2,13 +2,13 @@ from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager, CheckpointStorage, - CheckpointStrategy, + CheckpointConfig, _TrackedCheckpoint, ) def test_unlimited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) for i in range(10): cpm.register_checkpoint( @@ -19,7 +19,7 @@ def test_unlimited_persistent_checkpoints(): def test_limited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=2)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=2)) for i in range(10): cpm.register_checkpoint( @@ -30,7 +30,7 @@ def test_limited_persistent_checkpoints(): def test_no_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=0)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=0)) for i in range(10): cpm.register_checkpoint( @@ -41,7 +41,7 @@ def test_no_persistent_checkpoints(): def test_dont_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = False for i in range(10): @@ -53,7 +53,7 @@ def test_dont_persist_memory_checkpoints(): def test_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = True for i in range(10): @@ -66,7 +66,7 @@ def test_persist_memory_checkpoints(): def test_keep_best_checkpoints(): cpm = _CheckpointManager( - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="metric", checkpoint_score_order="min", From 0482bce4c4bc0b2204283b5455eb9b5474de90b0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:10:20 +0000 Subject: [PATCH 43/70] Missed this --- doc/source/train/user_guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index ff2b2556afb0..8a75792732cf 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -680,7 +680,7 @@ directory ` of each run. # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints # By default, the "best" checkpoint path will refer to the most recent one. - # This can be configured by defining a CheckpointStrategy. + # This can be configured by defining a CheckpointConfig. 
print(trainer.best_checkpoint_path) # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints/checkpoint_000005 From 0cb579ddd3b868849453d4a72239fd3581873e07 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 11:47:44 -0700 Subject: [PATCH 44/70] Update test_result_grid.py --- python/ray/tune/tests/test_result_grid.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index fde9713908d0..ea7bf0a5bcb3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -58,7 +58,7 @@ def f(config): assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( @@ -68,7 +68,7 @@ def f(config): ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_metric_mode_unset(ray_start_2_cpus): @@ -87,10 +87,10 @@ def f(config): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_no_checkpoint(ray_start_2_cpus): @@ -131,7 +131,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert result.dataframe is None + assert result.metrics_dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config From 7ade7e4878c87212fdb6d3707fc10e3447a76164 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 21:09:20 +0000 Subject: [PATCH 45/70] Fix --- python/ray/train/tests/test_examples.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 2ebef818d7aa..316ff4dc5fc3 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,7 +51,7 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -107,7 +107,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -146,7 +146,7 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] From 0937dc857fb32d4cf0ff99bb8daeae5d7c2ade85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: 
Fri, 24 Jun 2022 21:33:14 +0000 Subject: [PATCH 46/70] Apply feeedback from code review --- doc/source/ray-air/package-ref.rst | 3 +++ doc/source/train/api.rst | 6 ------ doc/source/train/user_guide.rst | 2 +- python/ray/air/config.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 1 + 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 3b57ecfe6dc5..24586c5c620b 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,3 +124,6 @@ Configs .. automodule:: ray.air.config :members: +.. _train-api-checkpoint-config: + +.. autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index ea8b879cdcd9..babf01861019 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -112,12 +112,6 @@ TorchTensorboardProfilerCallback .. autoclass:: ray.train.callbacks.TorchTensorboardProfilerCallback -Checkpointing -------------- - -.. _train-api-checkpoint-strategy: - - .. _train-api-func-utils: Training Function Utilities diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 8a75792732cf..701492136952 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,7 +691,7 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-strategy` can be passed into +checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into ``Trainer.run``. As an example, to completely disable writing checkpoints to disk: diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b7317d283ca..cec982da4188 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -6,7 +6,7 @@ from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -# Move here later when ml_utils is deprecated +# Move here later when ml_utils is deprecated. Doing it now causes a circular import. from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 493be617e2d4..a3153dbd5e06 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -186,6 +186,7 @@ def __repr__(self): # Move to ray.air.config when ml_utils is deprecated. +# Doing it now causes a circular import. @dataclass @PublicAPI(stability="alpha") class CheckpointConfig: From b99362770375c0bda7b0087e88fe58a0891933f5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 22:34:27 +0000 Subject: [PATCH 47/70] Fix lint --- doc/source/ray-air/package-ref.rst | 2 -- doc/source/train/user_guide.rst | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 24586c5c620b..744206354232 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,6 +124,4 @@ Configs .. automodule:: ray.air.config :members: -.. _train-api-checkpoint-config: - .. 
autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 701492136952..be799a49e666 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,8 +691,8 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into -``Trainer.run``. +checkpoints to disk), a :class:`CheckpointConfig` can be passed into +``Trainer``. As an example, to completely disable writing checkpoints to disk: From ed870bd4f72bd8d64d8a24d651a21b068114c3d2 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:46:51 -0700 Subject: [PATCH 48/70] Update python/ray/train/__init__.py --- python/ray/train/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 74d360f117b7..5039ae461499 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -14,7 +14,7 @@ from ray.train.trainer import Trainer, TrainingIterator from ray.air.config import CheckpointConfig -# deprecated +# Deprecated. Alias of CheckpointConfig for backwards compat from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") From a4fd532ea77f16b0e4e70e738eeceacdc7912d85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 27 Jun 2022 18:26:44 +0000 Subject: [PATCH 49/70] Fix CI --- .../ray/train/examples/tune_cifar_pytorch_pbt_example.py | 2 +- python/ray/train/tests/test_minimal.py | 9 ++++----- python/ray/train/tests/test_tune.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index a7031b3116a1..38abba231ae8 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -185,7 +185,7 @@ def train_func(config): ), run_config=RunConfig( stop={"training_iteration": 2 if args.smoke_test else 100}, - failure=FailureConfig(max_failures=3), # used for fault tolerance + failure_config=FailureConfig(max_failures=3), # used for fault tolerance ), ) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index a23d7b4f23f9..7541edb16852 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,7 +1,7 @@ import pytest import ray -import ray.train as train +from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -38,10 +38,9 @@ def test_run(ray_start_4_cpus): config = TestConfig() def train_func(): - checkpoint = train.load_checkpoint() - train.report(**checkpoint) - train.save_checkpoint(**checkpoint) - return checkpoint[key] + checkpoint = session.get_checkpoint() + session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint) + return checkpoint.to_dict()[key] checkpoint = Checkpoint.from_dict( { diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 0196a84e46b6..e407679268ad 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -200,7 +200,9 @@ def train_func(): trainer = DataParallelTrainer( train_func, 
backend_config=TestConfig(), scaling_config=dict(num_workers=1) ) - tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) + tuner = Tuner( + trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3)) + ) analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data From d0ae2ba1998b544b01897d03768b0fa0d9a5c3e7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:21:57 -0700 Subject: [PATCH 50/70] Use warnings.warn --- python/ray/util/ml_utils/checkpoint_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index a3153dbd5e06..1f64633a9666 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -7,6 +7,7 @@ import os import shutil import tempfile +import warnings from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -256,7 +257,7 @@ def checkpoint_score_attr(self) -> Optional[str]: @dataclass class CheckpointStrategy(CheckpointConfig): def __post_init__(self): - logger.warning(deprecation_message) + warnings.warn(deprecation_message) super().__post_init__() From d44f75026ad47a6fcea4035e1ee8de68bd1980a3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:47:05 -0700 Subject: [PATCH 51/70] Make method privat --- python/ray/air/tests/test_api.py | 8 ++++---- python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index e4474c24626a..ffd2d722378c 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -46,20 +46,20 @@ def test_checkpointing_config(): ) checkpointing = CheckpointConfig() - assert checkpointing.checkpoint_score_attr is None + assert checkpointing._tune_legacy_checkpoint_score_attr is None checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) - assert checkpointing.checkpoint_score_attr == "min-metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "min-metric" def test_scaling_config(): diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 2d348c94d47a..d1e01e0a8e8d 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -168,7 +168,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else None ), checkpoint_score_attr=( - self._run_config.checkpoint_config.checkpoint_score_attr + self._run_config.checkpoint_config._tune_legacy_checkpoint_score_attr if self._run_config.checkpoint_config else None ), diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 1f64633a9666..c687ca48e8ad 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ 
b/python/ray/util/ml_utils/checkpoint_manager.py @@ -234,8 +234,10 @@ def __post_init__(self): ) @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``. + + Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c9d33806a000f3bb84bb68ca467e6bd7d6675923 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:39:36 -0700 Subject: [PATCH 52/70] Update python/ray/util/ml_utils/checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index c687ca48e8ad..4b6576a20d79 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,6 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute From 5c0a75317897b9ee37f6dfd7899342c8ed490cb8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:41:17 -0700 Subject: [PATCH 53/70] Update checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 4b6576a20d79..e6e58ff77402 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,9 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" + + Only used for Legacy API compatibility. 
+ """ if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c7b783b05f6fd76d0ba4f2febc6d58d3720e0ccf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 29 Jun 2022 11:39:02 -0700 Subject: [PATCH 54/70] Fix test --- python/ray/train/tests/test_tune.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e407679268ad..e25193eddf54 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,7 @@ import ray import ray.train as train from ray import tune -from ray.air import Checkpoint +from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -154,13 +154,16 @@ def train_func(): def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() if ckpt is not None: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, config["max_iter"]): - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) @@ -185,17 +188,20 @@ def train_func(config): def test_retry(ray_start_4_cpus): def train_func(): - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? itr = 0 if ckpt: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, 4): if i == 2 and not restored: raise Exception("try to fail me") - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) From 2e9ec6644b4e6b06c47f96918df28ffd0d8e97e3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:39:04 +0000 Subject: [PATCH 55/70] Rename files --- doc/source/train/examples.rst | 8 ++++---- doc/source/train/examples/train_fashion_mnist_example.rst | 4 ++-- .../train/examples/train_linear_dataset_example.rst | 4 ++-- doc/source/train/examples/train_linear_example.rst | 4 ++-- .../train/examples/tune_cifar_pytorch_pbt_example.rst | 6 +++--- python/ray/train/BUILD | 6 +++--- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- ...on_mnist_example.py => torch_fashion_mnist_example.py} | 0 ...dataset_example.py => torch_linear_dataset_example.py} | 0 .../{train_linear_example.py => torch_linear_example.py} | 0 ...rch_pbt_example.py => tune_cifar_torch_pbt_example.py} | 1 - python/ray/train/examples/tune_linear_example.py | 2 +- python/ray/train/tests/test_examples.py | 8 ++++++-- python/ray/train/tests/test_gpu.py | 6 +++--- python/ray/train/tests/test_tune.py | 2 +- .../workloads/pytorch_pbt_failure.py | 2 +- release/ml_user_tests/train/train_torch_linear_test.py | 2 +- 17 files changed, 30 insertions(+), 27 deletions(-) rename python/ray/train/examples/{train_fashion_mnist_example.py => torch_fashion_mnist_example.py} (100%) rename python/ray/train/examples/{train_linear_dataset_example.py => torch_linear_dataset_example.py} (100%) rename 
python/ray/train/examples/{train_linear_example.py => torch_linear_example.py} (100%) rename python/ray/train/examples/{tune_cifar_pytorch_pbt_example.py => tune_cifar_torch_pbt_example.py} (99%) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 2a4e0b75bbd1..1529e63342cc 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -15,10 +15,10 @@ General Examples PyTorch ~~~~~~~ -* :doc:`/train/examples/train_linear_example`: +* :doc:`/train/examples/torch_linear_example`: Simple example for PyTorch. -* :doc:`/train/examples/train_fashion_mnist_example`: +* :doc:`/train/examples/torch_fashion_mnist_example`: End-to-end example for PyTorch. * :doc:`/train/examples/transformers/transformers_example`: @@ -59,7 +59,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/tensorflow_linear_dataset_example`: Simple example for training a linear TensorFlow model. -* :doc:`/train/examples/train_linear_dataset_example`: +* :doc:`/train/examples/torch_linear_dataset_example`: Simple example for training a linear PyTorch model. * :doc:`/train/examples/tune_torch_linear_dataset_example`: @@ -75,7 +75,7 @@ Ray Tune Integration Examples * :doc:`/train/examples/tune_tensorflow_mnist_example`: End-to-end example for tuning a TensorFlow model. -* :doc:`/train/examples/tune_cifar_pytorch_pbt_example`: +* :doc:`/train/examples/tune_cifar_torch_pbt_example`: End-to-end example for tuning a PyTorch model with PBT. .. diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/train_fashion_mnist_example.rst index e11849e7b5f5..7082cc1433db 100644 --- a/doc/source/train/examples/train_fashion_mnist_example.rst +++ b/doc/source/train/examples/train_fashion_mnist_example.rst @@ -1,6 +1,6 @@ :orphan: -train_fashion_mnist_example +torch_fashion_mnist_example =========================== -.. literalinclude:: /../../python/ray/train/examples/train_fashion_mnist_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/train_linear_dataset_example.rst index 5dfe21be0dc5..f84daeb67a7a 100644 --- a/doc/source/train/examples/train_linear_dataset_example.rst +++ b/doc/source/train/examples/train_linear_dataset_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_dataset_example +torch_linear_dataset_example ============================ -.. literalinclude:: /../../python/ray/train/examples/train_linear_dataset_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_linear_dataset_example.py diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/train_linear_example.rst index 3abb4af64c81..10f3090d5196 100644 --- a/doc/source/train/examples/train_linear_example.rst +++ b/doc/source/train/examples/train_linear_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_example +torch_linear_example ==================== -.. literalinclude:: /../../python/ray/train/examples/train_linear_example.py +.. 
literalinclude:: /../../python/ray/train/examples/torch_linear_example.py diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst index 5a1f156d8ee7..dae870f3247e 100644 --- a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst +++ b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst @@ -1,6 +1,6 @@ :orphan: -tune_cifar_pytorch_pbt_example -============================== +tune_cifar_torch_pbt_example +============================ -.. literalinclude:: /../../python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +.. literalinclude:: /../../python/ray/train/examples/tune_cifar_torch_pbt_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6f719b725e64..89f32eda50e2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -64,10 +64,10 @@ py_test( ) py_test( - name = "tune_cifar_pytorch_pbt_example", + name = "tune_cifar_torch_pbt_example", size = "medium", - main = "examples/tune_cifar_pytorch_pbt_example.py", - srcs = ["examples/tune_cifar_pytorch_pbt_example.py"], + main = "examples/tune_cifar_torch_pbt_example.py", + srcs = ["examples/tune_cifar_torch_pbt_example.py"], tags = ["team:ml", "exclusive", "pytorch", "tune"], deps = [":train_lib"], args = ["--smoke-test"] diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 1cda7fc3e1ac..2d223c43ec1d 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,7 +1,7 @@ import argparse from ray.air import RunConfig -from ray.train.examples.train_fashion_mnist_example import train_func +from ray.train.examples.torch_fashion_mnist_example import train_func from ray.train.torch import TorchTrainer from ray.tune.integration.mlflow import MLflowLoggerCallback diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py similarity index 100% rename from python/ray/train/examples/train_fashion_mnist_example.py rename to python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py similarity index 100% rename from python/ray/train/examples/train_linear_dataset_example.py rename to python/ray/train/examples/torch_linear_dataset_example.py diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/torch_linear_example.py similarity index 100% rename from python/ray/train/examples/train_linear_example.py rename to python/ray/train/examples/torch_linear_example.py diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py similarity index 99% rename from python/ray/train/examples/tune_cifar_pytorch_pbt_example.py rename to python/ray/train/examples/tune_cifar_torch_pbt_example.py index 38abba231ae8..f0b5c786ff8d 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -58,7 +58,6 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): - # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py 
index 5d4a8edc911b..096c35547842 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,6 +1,6 @@ import argparse -from train_linear_example import train_func +from torch_linear_example import train_func import ray from ray import tune diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 316ff4dc5fc3..1bb0753b1c6b 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -16,10 +16,10 @@ from ray.train.examples.torch_quick_start import ( train_func as torch_quick_start_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.examples.torch_linear_example import train_func as linear_train_func from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback @@ -172,6 +172,10 @@ def test_horovod_torch_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == num_workers + loss = list(results.metrics_dataframe["loss"]) + assert len(loss) == num_epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index ac9a0afe7cfb..a4ac411eb9f5 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -18,10 +18,10 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import LinearDataset +from ray.train.examples.torch_linear_example import LinearDataset from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer @@ -350,7 +350,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): - from ray.train.examples.train_linear_dataset_example import train_linear + from ray.train.examples.torch_linear_dataset_example import train_linear assert train_linear(num_workers=2, use_gpu=True) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e25193eddf54..6c34bb7259b4 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -13,7 +13,7 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index d354b2834ac6..0704bed7ec75 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ 
b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -6,7 +6,7 @@ import ray from ray import tune from ray.air.config import RunConfig -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.examples.tune_cifar_torch_pbt_example import train_func from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining from ray.tune.tune_config import TuneConfig diff --git a/release/ml_user_tests/train/train_torch_linear_test.py b/release/ml_user_tests/train/train_torch_linear_test.py index 2a2a0a751061..1629ec6cdda9 100644 --- a/release/ml_user_tests/train/train_torch_linear_test.py +++ b/release/ml_user_tests/train/train_torch_linear_test.py @@ -4,7 +4,7 @@ import ray -from ray.train.examples.train_linear_example import train_linear +from ray.train.examples.torch_linear_example import train_linear if __name__ == "__main__": start = time.time() From 2bf89d221e4e171b1ebb3f9f62a8f8d3532cd3da Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:43:12 +0000 Subject: [PATCH 56/70] Use keras callback --- .../examples/tensorflow_linear_dataset_example.py | 11 +++-------- python/ray/train/examples/tensorflow_mnist_example.py | 8 +------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index ccc408455b44..0ee9d48d2077 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -2,20 +2,15 @@ from typing import Dict, Tuple import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback import ray -import ray.train as train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def get_datasets_and_configs( a=5, b=10, size=1000 ) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: @@ -60,7 +55,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard("train") + dataset_pipeline = session.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() for _ in range(epochs): diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index a0ef319f8756..97e8db033025 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -7,17 +7,11 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback -import ray.train as train from ray.train.tensorflow import TensorflowTrainer -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. 
From 375790ecde6e07658618bedf74d6b991e801c2c7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:44:47 +0000 Subject: [PATCH 57/70] Revert docstring changes --- python/ray/train/train_loop_utils.py | 73 ++++++++++++---------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index 5b03fd1fffe5..58244652e961 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -38,25 +38,23 @@ def get_dataset_shard( import ray from ray import train - from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard("train").to_torch() + data_shard = train.get_dataset_shard().to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() + trainer = Trainer(backend="torch") + trainer.start() + # Trainer will automatically handle sharding. - trainer = TorchTrainer( - train_func, - datasets={"train": dataset}, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + train_model = trainer.run(train_func, dataset=dataset) + trainer.shutdown() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -97,15 +95,16 @@ def report(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be reported by Train. @@ -127,7 +126,6 @@ def world_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -135,8 +133,10 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -153,18 +153,16 @@ def local_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = TorchTrainer( - train_func, - scaling_config={"use_gpu": True, "num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch", use_gpu=True) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -180,29 +178,18 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train - from ray.air import Checkpoint - from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - checkpoint = Checkpoint.from_dict( - { - # this would be set during checkpoint saving - "_current_checkpoint_id": 1, - "epoch": 3, - } - ) - trainer = TorchTrainer( - train_func, - resume_from_checkpoint=checkpoint, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func, checkpoint={"epoch": 3}) # 3 # 4 + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -226,16 +213,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - result = trainer.fit() - assert result.checkpoint + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -255,14 +242,14 @@ def world_size() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) - result = trainer.fit() - + trainer = Trainer(backend="torch", num_workers=4) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() if session is None: From baaaf47718c6b5a46d228a9829c764e8b5f9390e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:40:37 +0000 Subject: [PATCH 58/70] Rename example files in docs --- ..._fashion_mnist_example.rst => torch_fashion_mnist_example.rst} | 0 ...inear_dataset_example.rst => torch_linear_dataset_example.rst} | 0 .../{train_linear_example.rst => torch_linear_example.rst} | 0 ...r_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename doc/source/train/examples/{train_fashion_mnist_example.rst => torch_fashion_mnist_example.rst} (100%) rename doc/source/train/examples/{train_linear_dataset_example.rst => torch_linear_dataset_example.rst} (100%) rename doc/source/train/examples/{train_linear_example.rst => torch_linear_example.rst} (100%) rename doc/source/train/examples/{tune_cifar_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} (100%) diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/torch_fashion_mnist_example.rst similarity index 100% rename from doc/source/train/examples/train_fashion_mnist_example.rst rename to doc/source/train/examples/torch_fashion_mnist_example.rst diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/torch_linear_dataset_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_dataset_example.rst rename to doc/source/train/examples/torch_linear_dataset_example.rst diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/torch_linear_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_example.rst rename to doc/source/train/examples/torch_linear_example.rst diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_torch_pbt_example.rst similarity index 100% rename from doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst rename to doc/source/train/examples/tune_cifar_torch_pbt_example.rst From 691ce99d80343295a18be0947422cab225b53a19 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:49:34 +0000 Subject: [PATCH 59/70] Add legacy tests --- python/ray/train/tests/test_minimal.py | 49 ++++++++++++++++ python/ray/train/tests/test_tune.py | 78 ++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 7541edb16852..e3a1670ed3fb 100644 --- a/python/ray/train/tests/test_minimal.py +++ 
b/python/ray/train/tests/test_minimal.py @@ -1,6 +1,11 @@ +from typing import List, Dict + import pytest import ray +import ray.train as train +from ray.train import Trainer +from ray.train.callbacks import TrainingCallback from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup @@ -30,6 +35,14 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass +class TestCallback(TrainingCallback): + def __init__(self): + self.result_list = [] + + def handle_result(self, results: List[Dict], **info): + self.result_list.append(results) + + def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 @@ -61,6 +74,42 @@ def train_func(): assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key] +def test_run_legacy(ray_start_4_cpus): + """Tests that Train can be run without any specific backends.""" + num_workers = 2 + key = "value" + value = 1 + config = TestConfig() + + def train_func(): + checkpoint = train.load_checkpoint() + train.report(**checkpoint) + train.save_checkpoint(**checkpoint) + return checkpoint[key] + + checkpoint = {key: value} + test_callback = TestCallback() + + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + + # Test results. + assert len(results) == num_workers + assert all(result == 1 for result in results) + + # Test reporting and callbacks. + assert len(test_callback.result_list) == value + assert len(test_callback.result_list[0]) == num_workers + print(test_callback.result_list[0]) + assert all(result[key] == value for result in test_callback.result_list[0]) + + # Test checkpointing. 
+ assert trainer.latest_checkpoint[key] == value + + trainer.shutdown() + + def test_failure(): """Tests that backend frameworks and non-critical libraries are not imported.""" with pytest.raises(ModuleNotFoundError): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 6c34bb7259b4..640fa98a19a0 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -18,6 +19,7 @@ ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer +from ray.train.trainer import Trainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -219,6 +221,82 @@ def train_func(): assert len(trial_dfs[0]["training_iteration"]) == 4 +def test_tune_error_legacy(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + with pytest.raises(TuneError): + tune.run(TestTrainable) + + +def test_tune_checkpoint_legacy(ray_start_4_cpus): + def train_func(): + for i in range(10): + train.report(test=i) + train.save_checkpoint(hello="world") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable).trials + checkpoint_path = trial.checkpoint.dir_or_data + assert os.path.exists(checkpoint_path) + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["hello"] == "world" + + +def test_reuse_checkpoint_legacy(ray_start_4_cpus): + def train_func(config): + itr = 0 + ckpt = train.load_checkpoint() + if ckpt is not None: + itr = ckpt["iter"] + 1 + + for i in range(itr, config["max_iter"]): + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + checkpoint_path = trial.checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 4 + analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 5 + + +def test_retry_legacy(ray_start_4_cpus): + def train_func(): + ckpt = train.load_checkpoint() + restored = bool(ckpt) # Does a previous checkpoint exist? 
+ itr = 0 + if ckpt: + itr = ckpt["iter"] + 1 + + for i in range(itr, 4): + if i == 2 and not restored: + raise Exception("try to fail me") + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + analysis = tune.run(TestTrainable, max_failures=3) + checkpoint_path = analysis.trials[0].checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 3 + + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 4 + + if __name__ == "__main__": import sys From d9122c38620d646fa923a130dbeed2b47951d0b1 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 18:00:31 +0000 Subject: [PATCH 60/70] Switch to session in train code --- .../horovod/horovod_pytorch_example.py | 18 ++++---- .../examples/horovod/horovod_tune_example.py | 4 +- .../pytorch/torch_fashion_mnist_example.py | 9 ++-- .../pytorch/torch_linear_dataset_example.py | 12 ++--- .../examples/pytorch/torch_linear_example.py | 3 +- .../distributed_sage_example.py | 21 +++++---- .../tf/tensorflow_autoencoder_example.ipynb | 17 +++---- .../tf/tensorflow_autoencoder_example.py | 12 ++--- .../tf/tensorflow_linear_dataset_example.py | 12 ++--- .../examples/tf/tensorflow_mnist_example.py | 9 +--- python/ray/air/tests/test_dataset_config.py | 10 ++--- python/ray/air/util/check_ingest.py | 18 ++++---- .../ray/train/_internal/backend_executor.py | 19 ++++---- python/ray/train/_internal/checkpoint.py | 2 +- python/ray/train/_internal/dataset_spec.py | 8 ++-- python/ray/train/_internal/session.py | 2 +- python/ray/train/backend.py | 3 +- python/ray/train/constants.py | 6 +-- python/ray/train/data_parallel_trainer.py | 27 ++++++------ .../train/examples/horovod/horovod_example.py | 4 +- .../train/examples/mlflow_simple_example.py | 5 +-- .../examples/torch_fashion_mnist_example.py | 10 +++-- .../examples/torch_linear_dataset_example.py | 7 +-- .../train/examples/torch_linear_example.py | 3 +- .../examples/tune_cifar_torch_pbt_example.py | 14 +++--- python/ray/train/horovod/horovod_trainer.py | 38 +++++++++------- .../train/huggingface/_huggingface_utils.py | 17 +++---- .../train/huggingface/huggingface_trainer.py | 11 +++-- .../train/tensorflow/tensorflow_trainer.py | 44 ++++++++++--------- .../train/tests/test_tensorflow_trainer.py | 3 +- python/ray/train/tests/test_torch_trainer.py | 7 +-- python/ray/train/tests/test_tune.py | 8 ++-- python/ray/train/torch/torch_trainer.py | 37 +++++++++------- python/ray/train/torch/train_loop_utils.py | 27 ++++++++++-- .../horovod/workloads/horovod_tune_test.py | 21 +++++---- 35 files changed, 243 insertions(+), 225 deletions(-) diff --git a/python/ray/air/examples/horovod/horovod_pytorch_example.py b/python/ray/air/examples/horovod/horovod_pytorch_example.py index fe7355eb8a00..946cddc4fd59 100644 --- a/python/ray/air/examples/horovod/horovod_pytorch_example.py +++ b/python/ray/air/examples/horovod/horovod_pytorch_example.py @@ -2,6 +2,7 @@ from filelock import FileLock import horovod.torch as hvd import os +from ray.air.checkpoint import Checkpoint import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -9,7 +10,7 @@ from torchvision import datasets, transforms import ray -from ray import train +from ray.air import session from ray.train.horovod import HorovodTrainer @@ -141,19 +142,16 @@ def train_func(config): model, optimizer, 
train_loader, train_sampler = setup(config) - results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - results.append(loss) - if save_model_as_dict: - train.save_checkpoint(model=model.state_dict()) - else: - train.save_checkpoint(model=model) - print("losses of each epoch:") - print(results) - return results + if save_model_as_dict: + checkpoint_dict = dict(model=model.state_dict()) + else: + checkpoint_dict = dict(model=model) + checkpoint_dict = Checkpoint.from_dict(checkpoint_dict) + session.report(dict(loss=loss), checkpoint=checkpoint_dict) def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/air/examples/horovod/horovod_tune_example.py b/python/ray/air/examples/horovod/horovod_tune_example.py index 05ab4032924d..24539d759a40 100644 --- a/python/ray/air/examples/horovod/horovod_tune_example.py +++ b/python/ray/air/examples/horovod/horovod_tune_example.py @@ -3,8 +3,8 @@ import torch import ray -from ray import train from ray import tune +from ray.air import session from ray.train.horovod import HorovodTrainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -83,7 +83,7 @@ def train_loop_per_worker(config): optimizer.step() time.sleep(0.1) - train.report(loss=loss.item()) + session.report(dict(loss=loss.item())) total = time.time() - start print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.") diff --git a/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py b/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py index b4e2088b9d68..a618292ecf70 100644 --- a/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py +++ b/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict +from ray.air import session import torch from torch import nn @@ -48,7 +49,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -66,7 +67,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -90,7 +91,7 @@ def train_func(config: Dict): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. 
train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -109,7 +110,7 @@ def train_func(config: Dict): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) + session.report(dict(loss=loss)) def train_fashion_mnist(num_workers=2, use_gpu=False): diff --git a/python/ray/air/examples/pytorch/torch_linear_dataset_example.py b/python/ray/air/examples/pytorch/torch_linear_dataset_example.py index 42d1569e6623..6da379ba8de7 100644 --- a/python/ray/air/examples/pytorch/torch_linear_dataset_example.py +++ b/python/ray/air/examples/pytorch/torch_linear_dataset_example.py @@ -1,13 +1,14 @@ import argparse import random from typing import Tuple +from ray.air.checkpoint import Checkpoint import torch import torch.nn as nn import ray import ray.train as train -from ray.air import train_test_split +from ray.air import session, train_test_split from ray.air.result import Result from ray.data import Dataset from ray.train.batch_predictor import BatchPredictor @@ -64,8 +65,8 @@ def train_func(config): lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) - train_dataset_shard = train.get_dataset_shard("train") - validation_dataset = train.get_dataset_shard("validation") + train_dataset_shard = session.get_dataset_shard("train") + validation_dataset = session.get_dataset_shard("validation") model = nn.Linear(1, hidden_size) model = train.torch.prepare_model(model) @@ -95,13 +96,12 @@ def train_func(config): device = train.torch.get_device() train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) - if train.world_rank() == 0: + if session.get_world_rank() == 0: result = validate_epoch(validation_torch_dataset, model, loss_fn, device) else: result = {} - train.report(**result) results.append(result) - train.save_checkpoint(model=model) + session.report(result, checkpoint=Checkpoint.from_dict(dict(model=model))) return results diff --git a/python/ray/air/examples/pytorch/torch_linear_example.py b/python/ray/air/examples/pytorch/torch_linear_example.py index 31f856416296..ba0d12fd1197 100644 --- a/python/ray/air/examples/pytorch/torch_linear_example.py +++ b/python/ray/air/examples/pytorch/torch_linear_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn import ray.train as train @@ -78,8 +79,8 @@ def train_func(config): for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) - train.report(**result) results.append(result) + session.report(result) return results diff --git a/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py b/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py index a1740e6cefc6..9c1ab2e00063 100644 --- a/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py +++ b/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py @@ -4,6 +4,7 @@ import os import argparse from filelock import FileLock +from ray.air import session import torch import torch.nn.functional as F @@ -63,8 +64,8 @@ def train_loop_per_worker(train_loop_config): data = dataset[0] train_idx = data.train_mask.nonzero(as_tuple=False).view(-1) - train_idx = train_idx.split(train_idx.size(0) // train.world_size())[ - train.world_rank() + train_idx = train_idx.split(train_idx.size(0) // session.get_world_size())[ + session.get_world_rank() ] train_loader = NeighborSampler( 
@@ -79,7 +80,7 @@ def train_loop_per_worker(train_loop_config): train_loader = train.torch.prepare_data_loader(train_loader, add_dist_sampler=False) # Do validation on rank 0 worker only. - if train.world_rank() == 0: + if session.get_world_rank() == 0: subgraph_loader = NeighborSampler( data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False ) @@ -112,13 +113,13 @@ def train_loop_per_worker(train_loop_config): loss.backward() optimizer.step() - if train.world_rank() == 0: + if session.get_world_rank() == 0: print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}") train_accuracy = validation_accuracy = test_accuracy = None # Do validation on rank 0 worker only. - if train.world_rank() == 0: + if session.get_world_rank() == 0: model.eval() with torch.no_grad(): out = model.module.test(x, subgraph_loader) @@ -131,10 +132,12 @@ def train_loop_per_worker(train_loop_config): ) test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum()) - train.report( - train_accuracy=train_accuracy, - validation_accuracy=validation_accuracy, - test_accuracy=test_accuracy, + session.report( + dict( + train_accuracy=train_accuracy, + validation_accuracy=validation_accuracy, + test_accuracy=test_accuracy, + ) ) diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb index bbab099eaced..0d8e36efcebc 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb @@ -238,8 +238,8 @@ "source": [ "`train_func` contains regular TensorFlow code with a few notable exceptions:\n", "* We build and compile our model in the [`MultiWorkerMirrioredStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) context.\n", - "* We call {py:func}`train.get_dataset_shard ` to get a subset of our training data, and call {py:meth}`Dataset.to_tf ` with {py:func}`prepare_dataset_shard ` to convert the subset to a TensorFlow dataset.\n", - "* We save model state using {py:func}`train.save_checkpoint `.\n" + "* We call {py:func}`session.get_dataset_shard ` to get a subset of our training data, and call {py:meth}`Dataset.to_tf ` with {py:func}`prepare_dataset_shard ` to convert the subset to a TensorFlow dataset.\n", + "* We use the {py:class}`ray.air.callbacks.keras.Callback ` for metric reporting and checkpointing.\n" ] }, { @@ -250,14 +250,9 @@ "source": [ "import os\n", "import json\n", - "from ray import train\n", + "from ray.air import session\n", + "from ray.air.callbacks.keras import Callback\n", "from ray.train.tensorflow import prepare_dataset_shard\n", - "from tensorflow.keras.callbacks import Callback\n", - "\n", - "class TrainCheckpointReportCallback(Callback):\n", - " def on_epoch_end(self, epoch, logs=None):\n", - " train.save_checkpoint(**{\"model\": self.model.get_weights()})\n", - " train.report(**logs)\n", "\n", "def train_func(config: dict):\n", "\n", @@ -268,7 +263,7 @@ " tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n", " num_workers = len(tf_config[\"cluster\"][\"worker\"])\n", "\n", - " dataset_shard = train.get_dataset_shard(\"train\")\n", + " dataset_shard = session.get_dataset_shard(\"train\")\n", "\n", " strategy = tf.distribute.MultiWorkerMirroredStrategy()\n", "\n", @@ -296,7 +291,7 @@ " )\n", " )\n", " history = multi_worker_model.fit(\n", - " tf_dataset, callbacks=[TrainCheckpointReportCallback()]\n", + " tf_dataset, callbacks=[Callback()]\n", " )\n", " 
results.append(history.history)\n", " return results" diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.py b/python/ray/air/examples/tf/tensorflow_autoencoder_example.py index e90a334de885..c3b61d2738f2 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.py +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.py @@ -5,7 +5,7 @@ import argparse import numpy as np import pandas as pd -import ray.train as train +from ray.air import session import tensorflow as tf import tensorflow_datasets as tfds from ray.data.datasource import SimpleTensorFlowDatasource @@ -14,19 +14,13 @@ from ray.air.result import Result from ray.air.train.integrations.tensorflow import TensorflowTrainer from ray.train.tensorflow import prepare_dataset_shard -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback import ray from ray.data.extensions import TensorArray -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def get_dataset(split_type="train"): def dataset_factory(): return tfds.load("mnist", split=[split_type], as_supervised=True)[0].take(128) @@ -80,7 +74,7 @@ def train_func(config: dict): per_worker_batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") strategy = tf.distribute.MultiWorkerMirroredStrategy() diff --git a/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py b/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py index c38afc61c92c..18a74e3cc343 100644 --- a/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py +++ b/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py @@ -2,10 +2,10 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import Callback import ray -import ray.train as train +from ray.air import session +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback from ray.air.result import Result from ray.data import Dataset from ray.train.batch_predictor import BatchPredictor @@ -16,12 +16,6 @@ ) -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def get_dataset(a=5, b=10, size=1000) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -53,7 +47,7 @@ def train_func(config: dict): metrics=[tf.keras.metrics.mean_squared_error], ) - dataset = train.get_dataset_shard("train") + dataset = session.get_dataset_shard("train") results = [] for _ in range(epochs): diff --git a/python/ray/air/examples/tf/tensorflow_mnist_example.py b/python/ray/air/examples/tf/tensorflow_mnist_example.py index cfa3701f2fdd..e008bc0ae28f 100644 --- a/python/ray/air/examples/tf/tensorflow_mnist_example.py +++ b/python/ray/air/examples/tf/tensorflow_mnist_example.py @@ -8,16 +8,9 @@ import numpy as np from ray.air.result import Result import tensorflow as tf -from tensorflow.keras.callbacks import Callback -import ray.train as train from ray.train.tensorflow import TensorflowTrainer - - -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - 
train.report(**logs) +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback def mnist_dataset(batch_size: int) -> tf.data.Dataset: diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 9b30f7252e0d..c6a352ac33c0 100644 --- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -4,7 +4,7 @@ import pytest import ray -from ray import train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import Dataset, DatasetPipeline from ray.data.preprocessors import BatchMapper @@ -29,13 +29,13 @@ def __init__( self, num_workers: int, expect_ds: bool, expect_sizes: Optional[dict], **kwargs ): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") if expect_ds: assert isinstance(data_shard, Dataset), data_shard else: assert isinstance(data_shard, DatasetPipeline), data_shard for k, v in expect_sizes.items(): - shard = train.get_dataset_shard(k) + shard = session.get_dataset_shard(k) if v == -1: assert shard is None, shard else: @@ -197,7 +197,7 @@ class TestStream(DataParallelTrainer): def __init__(self, check_results_fn, **kwargs): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") assert isinstance(data_shard, DatasetPipeline), data_shard results = [] for epoch in data_shard.iter_epochs(2): @@ -218,7 +218,7 @@ class TestBatch(DataParallelTrainer): def __init__(self, check_results_fn, **kwargs): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") assert isinstance(data_shard, Dataset), data_shard results = data_shard.take() check_results_fn(data_shard, results) diff --git a/python/ray/air/util/check_ingest.py b/python/ray/air/util/check_ingest.py index 74c72107f149..0073a9bbaf20 100755 --- a/python/ray/air/util/check_ingest.py +++ b/python/ray/air/util/check_ingest.py @@ -7,7 +7,7 @@ import numpy as np import ray -from ray import train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import DatasetPipeline, Dataset from ray.data.preprocessors import BatchMapper, Chain @@ -67,8 +67,8 @@ def make_train_loop( def train_loop_per_worker(): import pandas as pd - rank = train.world_rank() - data_shard = train.get_dataset_shard("train") + rank = session.get_world_rank() + data_shard = session.get_dataset_shard("train") start = time.perf_counter() epochs_read, batches_read, bytes_read = 0, 0, 0 batch_delays = [] @@ -102,11 +102,13 @@ def generate_epochs(data: Union[Dataset, DatasetPipeline], epochs: int): # NOTE: This isn't recursive and will just return the size of # the object pointers if list of non-primitive types. 
bytes_read += sys.getsizeof(batch) - train.report( - bytes_read=bytes_read, - batches_read=batches_read, - epochs_read=epochs_read, - batch_delay=batch_delay, + session.report( + dict( + bytes_read=bytes_read, + batches_read=batches_read, + epochs_read=epochs_read, + batch_delay=batch_delay, + ) ) batch_start = time.perf_counter() delta = time.perf_counter() - start diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index b8ccec69ca3c..c658a3ae66b1 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -44,7 +44,7 @@ class BackendExecutor: This class holds a worker group and is responsible for executing the training function on the workers, and collecting intermediate results - from ``train.report()`` and ``train.checkpoint()``. + from ``session.report()``. Args: backend_config: The configurations for this @@ -288,7 +288,7 @@ def start_training( Dataset. checkpoint: The checkpoint data that should be loaded onto each worker and accessed by the - training function via ``train.load_checkpoint()``. If this + training function via ``session.get_checkpoint()``. If this is ``None`` then no checkpoint will be loaded. """ use_detailed_autofilled_metrics = env_integer( @@ -362,8 +362,7 @@ def get_next_results(self) -> Optional[List[TrainingResult]]: """Fetches the next ``TrainingResult`` from each worker. Each ``TrainingResult`` is expected to correspond to the same step from - each worker (e.g. the same call to ``train.report()`` or - ``train.checkpoint()``). + each worker (e.g. the same call to ``session.report()``). Returns: A list of ``TrainingResult``s with the same @@ -396,7 +395,8 @@ def get_next(): raise RuntimeError( "Some workers returned results while " "others didn't. Make sure that " - "`train.report()` and `train.save_checkpoint()` " + "`session.report()` (legacy API:" + "`train.report()` and `train.save_checkpoint()`) " "are called the same number of times on all " "workers." ) @@ -408,10 +408,11 @@ def get_next(): if any(r.type != result_type for r in results): raise RuntimeError( "Some workers returned results with " - "different types. Make sure `train.report()` " - "and `train.save_checkpoint()` are called the " - "same number of times and in the same order on " - "each worker." + "different types. Make sure that " + "`session.report()` (legacy API:" + "`train.report()` and `train.save_checkpoint()`) " + "are called the same number of times on all " + "workers." ) return results diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 8bffe957833d..bef2a63fe9ac 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -116,7 +116,7 @@ def _process_checkpoint( f"checkpoint_score_attribute: " f"{score_attr}. " f"Include this attribute in the call to " - f"train.save_checkpoint." + f"`session.report()`." ) tracked_checkpoint = _TrackedCheckpoint( diff --git a/python/ray/train/_internal/dataset_spec.py b/python/ray/train/_internal/dataset_spec.py index 6e96a522a84d..f4e3d5e3fb9e 100644 --- a/python/ray/train/_internal/dataset_spec.py +++ b/python/ray/train/_internal/dataset_spec.py @@ -17,10 +17,10 @@ class RayDatasetSpec: dataset_or_dict: An optional Ray Dataset (or DatasetPipeline) or a dictionary of datasets to be sharded across all the training workers, which can be accessed - from the training function via ``train.get_dataset_shard()``. 
Multiple Datasets - can be passed in as a dictionary that maps each name key to a Dataset value, - and each Dataset can be accessed from the training function by passing in a - `dataset_name` argument to ``train.get_dataset_shard()``. + from the training function via ``session.get_dataset_shard()``. Multiple + Datasets can be passed in as a dictionary that maps each name key to a + Dataset value, and each Dataset can be accessed from the training function + by passing in a `dataset_name` argument to ``session.get_dataset_shard()``. dataset_split_fn: An optional callable to specify how the provided ``dataset`` should be split across the training workers. It is expected to take in two arguments. The first one is the ``dataset``, just as is passed in to the diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 10008bda62a4..b184cd3787df 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -124,7 +124,7 @@ def start(self): self.training_thread.start() def pause_reporting(self): - """Ignore all future ``train.report()`` calls.""" + """Ignore all future ``session.report()`` calls.""" self.ignore_report = True def finish(self): diff --git a/python/ray/train/backend.py b/python/ray/train/backend.py index f450b27544be..7e22f0f431cb 100644 --- a/python/ray/train/backend.py +++ b/python/ray/train/backend.py @@ -46,8 +46,7 @@ def encode_data(data_dict: Dict) -> EncodedData: """Logic to encode a data dict before sending to the driver. This function will be called on the workers for any data that is - sent to the driver via ``train.report()`` or - ``train.save_checkpoint()``. + sent to the driver via ``session.report()``. """ return data_dict diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index d640293799a5..b814380972a9 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -14,7 +14,7 @@ WILDCARD_KEY, ) -# Autofilled train.report() metrics. Keys should be consistent with Tune. +# Autofilled session.report() metrics. Keys should be consistent with Tune. TIMESTAMP = "_timestamp" TIME_THIS_ITER_S = "_time_this_iter_s" TRAINING_ITERATION = "_training_iteration" @@ -72,10 +72,10 @@ # Reserved keyword used by the ``TorchWorkerProfiler`` and # ``TorchTensorboardProfilerCallback`` for passing PyTorch Profiler data -# through ``train.report()`` +# through ``session.report()`` PYTORCH_PROFILER_KEY = "_train_torch_profiler" # Reserved keys used across all Callbacks. -# By default these will be filtered out from ``train.report()``. +# By default these will be filtered out from ``session.report()``. # See ``TrainingCallback._preprocess_results`` for more details. ALL_RESERVED_KEYS = {PYTORCH_PROFILER_KEY} diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index c8729282ab40..be8eae27d351 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -76,36 +76,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. 
+ ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() **How do I use ``DataParallelTrainer`` or any of its subclasses?** @@ -114,10 +113,10 @@ def train_loop_per_worker(): .. code-block:: python import ray - from ray import train + from ray.air import session def train_loop_for_worker(): - dataset_shard_for_this_worker = train.get_dataset_shard("train") + dataset_shard_for_this_worker = session.get_dataset_shard("train") assert len(dataset_shard_for_this_worker) == 1 diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c01788008ec5..bb5e4ee7a567 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -2,6 +2,7 @@ import os import horovod.torch as hvd +from ray.air import session import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -10,7 +11,6 @@ from torchvision import datasets, transforms import ray -from ray import train from ray.train.horovod import HorovodTrainer @@ -148,7 +148,7 @@ def train_func(config): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - train.report(loss=loss) + session.report(dict(loss=loss)) def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d64d0525ae58..a5803a072a71 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -1,5 +1,4 @@ -from ray import train -from ray.air import RunConfig +from ray.air import RunConfig, session from ray.train.torch import TorchTrainer from ray.tune.integration.mlflow import MLflowLoggerCallback from ray.tune.logger import TBXLoggerCallback @@ -7,7 +6,7 @@ def train_func(): for i in range(3): - train.report(epoch=i) + session.report(dict(epoch=i)) trainer = TorchTrainer( diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 6e8db3220db4..725ba078e5d5 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict +from ray.air import session import torch from torch import nn @@ -48,7 +49,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = 
len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -66,7 +67,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -90,7 +91,7 @@ def train_func(config: Dict): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -111,9 +112,10 @@ def train_func(config: Dict): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) loss_results.append(loss) + session.report(dict(loss=loss)) + # return required for backwards compatibility with the old API return loss_results diff --git a/python/ray/train/examples/torch_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py index acfa0ce2e637..a1e8889c4ea6 100644 --- a/python/ray/train/examples/torch_linear_dataset_example.py +++ b/python/ray/train/examples/torch_linear_dataset_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict, Tuple +from ray.air import session import torch import torch.nn as nn @@ -77,8 +78,8 @@ def train_func(config): lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) - train_dataset_pipeline_shard = train.get_dataset_shard("train") - validation_dataset_pipeline_shard = train.get_dataset_shard("validation") + train_dataset_pipeline_shard = session.get_dataset_shard("train") + validation_dataset_pipeline_shard = session.get_dataset_shard("validation") model = nn.Linear(1, hidden_size) model = train.torch.prepare_model(model) @@ -113,7 +114,7 @@ def train_func(config): train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) - train.report(**result) + session.report(result) def train_linear(num_workers=2, use_gpu=False): diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index ceabd0c2853f..15d7ade41622 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn @@ -78,8 +79,8 @@ def train_func(config): for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) - train.report(**result) results.append(result) + session.report(result) # return required for backwards compatibility with the old API return results diff --git a/python/ray/train/examples/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py index f0b5c786ff8d..50389cd6e503 100644 --- a/python/ray/train/examples/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn import torchvision.transforms as transforms @@ -20,7 +21,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): - size = 
len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -38,7 +39,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -98,7 +99,7 @@ def train_func(config): train_dataset = Subset(train_dataset, list(range(64))) validation_dataset = Subset(validation_dataset, list(range(64))) - worker_batch_size = config["batch_size"] // train.world_size() + worker_batch_size = config["batch_size"] // session.get_world_size() train_loader = DataLoader(train_dataset, batch_size=worker_batch_size) validation_loader = DataLoader(validation_dataset, batch_size=worker_batch_size) @@ -109,15 +110,10 @@ def train_func(config): # Create loss. criterion = nn.CrossEntropyLoss() - results = [] - for _ in range(epochs): train_epoch(train_loader, model, criterion, optimizer) result = validate_epoch(validation_loader, model, criterion) - train.report(**result) - results.append(result) - - return results + session.report(result) if __name__ == "__main__": diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index 3b5e6e76c5ee..527170c62ee6 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -38,40 +38,40 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You could use ``TensorflowPredictor`` or ``TorchPredictor`` in conjunction with - HorovodTrainer. You must save the model under the "model" kwarg in - ``train.save_checkpoint()``, so that it can be used by corresponding predictors. + HorovodTrainer. You must save the model under the "model" kwarg in the + ``Checkpoint`` passed to ``session.report()``, so that it can be used by + corresponding predictors. 
Example: @@ -83,6 +83,7 @@ def train_loop_per_worker(): import horovod.torch as hvd import torch import torch.nn as nn + from ray.air import session, Checkpoint from ray.train.horovod import HorovodTrainer input_size = 1 @@ -101,7 +102,7 @@ def forward(self, input): def train_loop_per_worker(): hvd.init() - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") model = NeuralNetwork() device = train.torch.get_device() model.to(device) @@ -132,7 +133,12 @@ def train_loop_per_worker(): loss.backward() optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - train.save_checkpoint(model=model.state_dict()) + session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(model=model.state_dict()) + ), + ) train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) scaling_config = {"num_workers": 3} # If using GPUs, use the below scaling config instead. diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/huggingface/_huggingface_utils.py index 2f3dd53f0ad8..d7b50f810d99 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/huggingface/_huggingface_utils.py @@ -5,7 +5,8 @@ import transformers.trainer from transformers.trainer_callback import TrainerCallback -from ray import train +from ray.air import session +from ray.air.checkpoint import Checkpoint from ray.util import get_node_ip_address from ray.data.dataset import Dataset @@ -118,9 +119,9 @@ def __init__(self) -> None: # HF first logs metrics, and then checkpoints. With Ray AIR, we need the # opposite. Furthermore, some metrics are logged just at the end. # Therefore, if we detect that a checkpoint will be created, - # we delay the train.report call after the checkpoint is reported + # we delay the session.report call after the checkpoint is reported # to Ray Train. - self.delayed_report = {} + self.delayed_report = {"metrics": {}, "checkpoint": None} super().__init__() def on_step_end(self, args, state, control, **kwargs): @@ -132,7 +133,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, model=None, logs=None, **kwargs): # Log is called in multiple places (evaluation, train metrics). report = {**logs, "step": state.global_step, "epoch": state.epoch} - self.delayed_report.update(report) + self.delayed_report["metrics"].update(report) def on_save(self, args, state, control, **kwargs): # Save is called after evaluation. 
@@ -140,8 +141,8 @@ def on_save(self, args, state, control, **kwargs): transformers.trainer.get_last_checkpoint(args.output_dir) ).absolute() if checkpoint_path: - train.save_checkpoint( - **{ + self.delayed_report["checkpoint"] = Checkpoint.from_dict( + { NODE_IP_KEY: get_node_ip_address(), CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path), } @@ -149,8 +150,8 @@ def on_save(self, args, state, control, **kwargs): def _report(self): if self.delayed_report: - train.report(**self.delayed_report) - self.delayed_report = {} + session.report(**self.delayed_report) + self.delayed_report = {"metrics": {}, "checkpoint": None} def on_epoch_begin(self, args, state, control, **kwargs): # Report previous epoch - this way we ensure everything diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py index 786909b927b3..e8ef4a78d21f 100644 --- a/python/ray/train/huggingface/huggingface_trainer.py +++ b/python/ray/train/huggingface/huggingface_trainer.py @@ -13,7 +13,6 @@ import transformers.training_args from torch.utils.data import Dataset as TorchDataset -from ray import train from ray.air import session from ray.air._internal.checkpointing import ( save_preprocessor_to_dir, @@ -408,12 +407,12 @@ def _huggingface_train_loop_per_worker(config): trainer_init_per_worker = config.pop("_trainer_init_per_worker") # Env vars necessary for HF to setup DDP - os.environ["RANK"] = str(train.world_rank()) - os.environ["WORLD_SIZE"] = str(train.world_size()) - os.environ["LOCAL_RANK"] = str(train.local_rank()) + os.environ["RANK"] = str(session.get_world_rank()) + os.environ["WORLD_SIZE"] = str(session.get_world_size()) + os.environ["LOCAL_RANK"] = str(session.get_local_rank()) - train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY) - eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY) + train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY) + eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY) train_torch_dataset, eval_torch_dataset = process_datasets( train_dataset, diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index 3d5921767724..ffda25726780 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -38,36 +38,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. 
- train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You can also use any of the :ref:`TensorFlow specific function utils `. @@ -77,12 +76,12 @@ def train_loop_per_worker(): def train_loop_per_worker(): # Turns off autosharding for a dataset. # You should use this if you are doing - # `train.get_dataset_shard(...).to_tf(...)` + # `session.get_dataset_shard(...).to_tf(...)` # as the data will be already sharded. train.tensorflow.prepare_dataset_shard(...) To save a model to use for the ``TensorflowPredictor``, you must save it under the - "model" kwarg in ``train.save_checkpoint()``. + "model" kwarg in ``Checkpoint`` passed to ``session.report()``. Example: @@ -92,9 +91,8 @@ def train_loop_per_worker(): import ray from ray import train - from ray.train.tensorflow import prepare_dataset_shard - - from ray.train.tensorflow import TensorflowTrainer + from ray.air import session, Checkpoint + from ray.train.tensorflow import prepare_dataset_shard, TensorflowTrainer input_size = 1 @@ -106,7 +104,7 @@ def build_model(): ) def train_loop_for_worker(config): - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with strategy.scope(): model = build_model() @@ -125,8 +123,14 @@ def train_loop_for_worker(config): ) ) model.fit(tf_dataset) - train.save_checkpoint( - epoch=epoch, model=model.get_weights()) + # You can also use ray.air.callbacks.keras.Callback + # for reporting and checkpointing instead of reporting manually. 
+ session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(epoch=epoch, model=model.get_weights()) + ), + ) train_dataset = ray.data.from_items( [{"x": x, "y": x + 1} for x in range(32)]) diff --git a/python/ray/train/tests/test_tensorflow_trainer.py b/python/ray/train/tests/test_tensorflow_trainer.py index 95c3f84b92e1..1f17de0c3e25 100644 --- a/python/ray/train/tests/test_tensorflow_trainer.py +++ b/python/ray/train/tests/test_tensorflow_trainer.py @@ -4,7 +4,6 @@ import pytest import ray -from ray import train from ray.air import session from ray.air.checkpoint import Checkpoint from ray.air.examples.tf.tensorflow_linear_dataset_example import get_dataset @@ -65,7 +64,7 @@ def train_func(config): def test_tensorflow_e2e(ray_start_4_cpus): def train_func(): model = build_model().get_weights() - train.save_checkpoint(**{MODEL_KEY: model}) + session.report({}, checkpoint=Checkpoint.from_dict({MODEL_KEY: model})) scaling_config = {"num_workers": 2} trainer = TensorflowTrainer( diff --git a/python/ray/train/tests/test_torch_trainer.py b/python/ray/train/tests/test_torch_trainer.py index de0b6f8606c4..0d158e3456a2 100644 --- a/python/ray/train/tests/test_torch_trainer.py +++ b/python/ray/train/tests/test_torch_trainer.py @@ -1,8 +1,9 @@ import pytest +from ray.air import session +from ray.air.checkpoint import Checkpoint import torch import ray -from ray import train from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -39,7 +40,7 @@ def train_func(config): def test_torch_e2e(ray_start_4_cpus): def train_func(): model = torch.nn.Linear(1, 1) - train.save_checkpoint(model=model) + session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model))) scaling_config = {"num_workers": 2} trainer = TorchTrainer( @@ -65,7 +66,7 @@ def __call__(self, x): def test_torch_e2e_state_dict(ray_start_4_cpus): def train_func(): model = torch.nn.Linear(1, 1).state_dict() - train.save_checkpoint(model=model) + session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model))) scaling_config = {"num_workers": 2} trainer = TorchTrainer( diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 640fa98a19a0..4d1480a2c1c5 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -134,9 +134,11 @@ def train_func(config): def test_tune_checkpoint(ray_start_4_cpus): def train_func(): - for i in range(10): - train.report(test=i) - train.save_checkpoint(hello="world") + for i in range(9): + session.report(dict(test=i)) + session.report( + dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world")) + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index f43a8e259f17..fb9a0e148ae0 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -38,36 +38,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. 
+ ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You can also use any of the :ref:`Torch specific function utils `. @@ -82,14 +81,14 @@ def train_loop_per_worker(): # Configures the dataloader for distributed training by adding a # `DistributedSampler`. # You should NOT use this if you are doing - # `train.get_dataset_shard(...).to_torch(...)` + # `session.get_dataset_shard(...).to_torch(...)` train.torch.prepare_data_loader(...) # Returns the current torch device. train.torch.get_device() To save a model to use for the ``TorchPredictor``, you must save it under the - "model" kwarg in ``train.save_checkpoint()``. + "model" kwarg in ``Checkpoint`` passed to ``session.report()``. Example: .. code-block:: python @@ -99,6 +98,7 @@ def train_loop_per_worker(): import ray from ray import train + from ray.air import session, Checkpoint from ray.train.torch import TorchTrainer input_size = 1 @@ -117,7 +117,7 @@ def forward(self, input): return self.layer2(self.relu(self.layer1(input))) def train_loop_per_worker(): - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") model = NeuralNetwork() loss_fn = nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=0.1) @@ -133,7 +133,12 @@ def train_loop_per_worker(): optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - train.save_checkpoint(model=model.state_dict()) + session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(epoch=epoch, model=model.state_dict()) + ), + ) train_dataset = ray.data.from_items([1, 2, 3]) scaling_config = {"num_workers": 3} diff --git a/python/ray/train/torch/train_loop_utils.py b/python/ray/train/torch/train_loop_utils.py index 65b8368cbcf0..dee94bd2ec8c 100644 --- a/python/ray/train/torch/train_loop_utils.py +++ b/python/ray/train/torch/train_loop_utils.py @@ -10,6 +10,7 @@ import ray from ray import train +from ray.air import session from ray.train._internal.accelerator import Accelerator from ray.train.constants import PYTORCH_PROFILER_KEY from torch.optim import Optimizer @@ -282,7 +283,11 @@ def prepare_model( """ ddp_kwargs = ddp_kwargs or {} - rank = train.local_rank() + # Backwards compatibility + try: + rank = session.get_local_rank() + except Exception: + rank = train.local_rank() device = self.get_device() @@ -327,7 +332,13 @@ def model_get_state(self): # See https://stackoverflow.com/questions/972/adding-a-method-to-an-existing-object-instance. 
# noqa: E501 model.__getstate__ = types.MethodType(model_get_state, model) - if wrap_ddp and train.world_size() > 1: + # Backwards compatibility + try: + world_size = session.get_world_size() + except Exception: + world_size = train.world_size() + + if wrap_ddp and world_size > 1: logger.info("Wrapping provided model in DDP.") if torch.cuda.is_available(): model = DistributedDataParallel( @@ -365,13 +376,21 @@ def prepare_data_loader( if ``move_to_device`` is False. """ + # Backwards compatibility + try: + world_size = session.get_world_size() + world_rank = session.get_world_rank() + except Exception: + world_size = train.world_size() + world_rank = train.world_rank() + # Only add Distributed Sampler if the following conditions hold: # 1. More than one training worker is being used. # 2. A DistributedSampler has not already been added by the user. # 3. The dataset is not an IterableDataset. Samplers do not worker with # IterableDatasets. if ( - train.world_size() > 1 + world_size > 1 and not isinstance(data_loader.sampler, DistributedSampler) and not ( hasattr(data_loader, "dataset") @@ -413,7 +432,7 @@ def wrapper(worker_id): using_default_sampler = isinstance( loader.sampler, (SequentialSampler, RandomSampler) ) - if not using_default_sampler and train.world_rank() == 0: + if not using_default_sampler and world_rank == 0: logger.warn( f"The {loader.sampler.__class__.__name__} will be overwritten " "with a DistributedSampler. You can disable this by setting " diff --git a/release/air_tests/horovod/workloads/horovod_tune_test.py b/release/air_tests/horovod/workloads/horovod_tune_test.py index 5008834bea5d..bab14b9bbf0e 100755 --- a/release/air_tests/horovod/workloads/horovod_tune_test.py +++ b/release/air_tests/horovod/workloads/horovod_tune_test.py @@ -2,7 +2,7 @@ import torch.nn as nn import numpy as np import torchvision -from ray.air import RunConfig +from ray.air import RunConfig, session from ray.train.horovod import HorovodTrainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -12,7 +12,7 @@ import ray from ray import tune -from ray import train +from ray.air.checkpoint import Checkpoint from ray.tune.schedulers import create_scheduler from ray.util.ml_utils.resnet import ResNet18 @@ -37,7 +37,7 @@ def train_loop_per_worker(config): ) epoch = 0 - checkpoint = train.load_checkpoint() + checkpoint = session.get_checkpoint() if checkpoint: model_state = checkpoint["model_state"] optimizer_state = checkpoint["optimizer_state"] @@ -79,17 +79,20 @@ def train_loop_per_worker(config): # print statistics running_loss += loss.item() epoch_steps += 1 - train.report(loss=running_loss / epoch_steps) if i % 2000 == 1999: # print every 2000 mini-batches print( "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps) ) - - train.save_checkpoint( - model_state=net.state_dict(), - optimizer_state=optimizer.state_dict(), - epoch=epoch, + session.report( + dict(loss=running_loss / epoch_steps), + checkpoint=Checkpoint.from_dict( + dict( + model_state=net.state_dict(), + optimizer_state=optimizer.state_dict(), + epoch=epoch, + ) + ), ) From 17366cef2a9c621afd60c2fecdcdb97a40da504e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 18:32:47 +0000 Subject: [PATCH 61/70] Update docs --- doc/source/ray-air/doc_code/air_ingest.py | 8 +-- .../ray-air/doc_code/pytorch_starter.py | 9 +-- doc/source/ray-air/doc_code/tf_starter.py | 16 ++--- ...ert_existing_pytorch_code_to_ray_air.ipynb | 38 ++++++---- .../examples/tfx_tabular_train_to_serve.ipynb | 
12 ++-- .../examples/torch_image_example.ipynb | 12 ++-- .../examples/torch_incremental_learning.ipynb | 23 +++--- .../datasets_train/datasets_train.py | 70 +++++++++---------- doc/source/train/faq.rst | 8 +-- doc/source/tune/examples/horovod_simple.ipynb | 4 +- .../tf/tensorflow_autoencoder_example.ipynb | 2 +- 11 files changed, 106 insertions(+), 96 deletions(-) diff --git a/doc/source/ray-air/doc_code/air_ingest.py b/doc/source/ray-air/doc_code/air_ingest.py index 4671406ce306..c14d285de5ad 100644 --- a/doc/source/ray-air/doc_code/air_ingest.py +++ b/doc/source/ray-air/doc_code/air_ingest.py @@ -86,7 +86,7 @@ # __config_4__ import ray -from ray import train +from ray.air import session from ray.data import Dataset from ray.train.torch import TorchTrainer from ray.air.config import DatasetConfig @@ -94,7 +94,7 @@ def train_loop_per_worker(): # By default, bulk loading is used and returns a Dataset object. - data_shard: Dataset = train.get_dataset_shard("train") + data_shard: Dataset = session.get_dataset_shard("train") # Manually iterate over the data 10 times (10 epochs). for _ in range(10): @@ -117,7 +117,7 @@ def train_loop_per_worker(): # __config_5__ import ray -from ray import train +from ray.air import session from ray.data import DatasetPipeline from ray.train.torch import TorchTrainer from ray.air.config import DatasetConfig @@ -125,7 +125,7 @@ def train_loop_per_worker(): def train_loop_per_worker(): # A DatasetPipeline object is returned when `use_stream_api` is set. - data_shard: DatasetPipeline = train.get_dataset_shard("train") + data_shard: DatasetPipeline = session.get_dataset_shard("train") # Use iter_epochs(10) to iterate over 10 epochs of data. for epoch in data_shard.iter_epochs(10): diff --git a/doc/source/ray-air/doc_code/pytorch_starter.py b/doc/source/ray-air/doc_code/pytorch_starter.py index 9c57ac24b4b3..01837655eab8 100644 --- a/doc/source/ray-air/doc_code/pytorch_starter.py +++ b/doc/source/ray-air/doc_code/pytorch_starter.py @@ -29,6 +29,7 @@ from torch import nn from torch.utils.data import DataLoader import ray.train as train +from ray.air import session from ray.train.torch import TorchTrainer # Define model @@ -52,7 +53,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -70,7 +71,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -94,7 +95,7 @@ def train_func(config): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. 
train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -113,7 +114,7 @@ def train_func(config): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) + session.report(dict(loss=loss)) num_workers = 2 diff --git a/doc/source/ray-air/doc_code/tf_starter.py b/doc/source/ray-air/doc_code/tf_starter.py index 4116ff5dba58..2f20a65a159e 100644 --- a/doc/source/ray-air/doc_code/tf_starter.py +++ b/doc/source/ray-air/doc_code/tf_starter.py @@ -15,9 +15,9 @@ # __air_tf_train_start__ import tensorflow as tf -from tensorflow.keras.callbacks import Callback -import ray.train as train +from ray.air import session +from ray.air.callbacks.keras import Callback from ray.train.tensorflow import prepare_dataset_shard from ray.train.tensorflow import TensorflowTrainer @@ -33,12 +33,6 @@ def build_model() -> tf.keras.Model: return model -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def train_func(config: dict): batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) @@ -53,7 +47,7 @@ def train_func(config: dict): metrics=[tf.keras.metrics.mean_squared_error], ) - dataset = train.get_dataset_shard("train") + dataset = session.get_dataset_shard("train") results = [] for _ in range(epochs): @@ -67,9 +61,7 @@ def train_func(config: dict): batch_size=batch_size, ) ) - history = multi_worker_model.fit( - tf_dataset, callbacks=[TrainCheckpointReportCallback()] - ) + history = multi_worker_model.fit(tf_dataset, callbacks=[Callback()]) results.append(history.history) return results diff --git a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb index a94a9e4cfc81..715703cd7e82 100644 --- a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb +++ b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb @@ -674,10 +674,11 @@ "\n", "To facilitate this, we only need a few changes to the code:\n", "\n", - "1. We import Ray Train:\n", + "1. We import Ray Train and Ray AIR Session:\n", "\n", "```python\n", "import ray.train as train\n", + "from ray.air import session\n", "```\n", "\n", "\n", @@ -693,7 +694,7 @@ "3. We dynamically adjust the worker batch size according to the number of workers:\n", "\n", "```python\n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", "```\n", "\n", "4. We prepare the data loader for distributed data sharding:\n", @@ -716,13 +717,13 @@ "\n", "```python\n", " test_loss = test(test_dataloader, model, loss_fn)\n", - " train.report(loss=test_loss)\n", + " session.report(dict(loss=test_loss))\n", "```\n", "\n", "7. In the `train_epoch()` and `test_epoch()` functions we divide the `size` by the world size:\n", "\n", "```python\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", "```\n", "\n", "8. In the `train_epoch()` function we can get rid of the device mapping. 
Ray Train does this for us:\n", @@ -745,7 +746,7 @@ "outputs": [], "source": [ "def train_epoch(dataloader, model, loss_fn, optimizer):\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", " model.train()\n", " for batch, (X, y) in enumerate(dataloader):\n", " # We don't need this anymore! Ray Train does this automatically:\n", @@ -781,7 +782,7 @@ "outputs": [], "source": [ "def test_epoch(dataloader, model, loss_fn):\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", " num_batches = len(dataloader)\n", " model.eval()\n", " test_loss, correct = 0, 0\n", @@ -821,14 +822,14 @@ ], "source": [ "import ray.train as train\n", - "\n", + "from ray.air import session\n", "\n", "def train_func(config: dict):\n", " batch_size = config[\"batch_size\"]\n", " lr = config[\"lr\"]\n", " epochs = config[\"epochs\"]\n", " \n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", " \n", " # Create data loaders.\n", " train_dataloader = DataLoader(training_data, batch_size=batch_size_per_worker)\n", @@ -846,7 +847,7 @@ " for t in range(epochs):\n", " train_epoch(train_dataloader, model, loss_fn, optimizer)\n", " test_loss = test_epoch(test_dataloader, model, loss_fn)\n", - " train.report(loss=test_loss)\n", + " session.report(dict(loss=test_loss))\n", "\n", " print(\"Done!\")" ] @@ -1062,10 +1063,15 @@ "metadata": {}, "source": [ "### Enabling checkpointing to retrieve the model\n", - "Enabling checkpointing is pretty easy - we just need to call the `train.save_checkpoint()` API and pass the model state to it:\n", + "Enabling checkpointing is pretty easy - we just need to pass a `Checkpoint` object with the model state to the `session.report()` API.\n", "\n", "```python\n", - " train.save_checkpoint(epoch=t, model=model.module.state_dict())\n", + " from ray.air import Checkpoint\n", + "\n", + " checkpoint = Checkpoint.from_dict(\n", + " dict(epoch=t, model=model.module.state_dict())\n", + " )\n", + " session.report(dict(loss=test_loss), checkpoint=checkpoint)\n", "```\n", "\n", "Note that the `model.module` part is needed because the model gets wrapped in `torch.nn.DistributedDataParallel` by `train.torch.prepare_model`.\n", @@ -1086,6 +1092,8 @@ "metadata": {}, "outputs": [], "source": [ + "from ray.air import Checkpoint\n", + "\n", "def load_data():\n", " # Download training data from open datasets.\n", " training_data = datasets.FashionMNIST(\n", @@ -1110,7 +1118,7 @@ " lr = config[\"lr\"]\n", " epochs = config[\"epochs\"]\n", " \n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", " \n", " training_data, test_data = load_data() # <- this is new!\n", " \n", @@ -1130,8 +1138,10 @@ " for t in range(epochs):\n", " train_epoch(train_dataloader, model, loss_fn, optimizer)\n", " test_loss = test_epoch(test_dataloader, model, loss_fn)\n", - " train.save_checkpoint(epoch=t, model=model.module.state_dict()) # <- this is new!\n", - " train.report(loss=test_loss)\n", + " checkpoint = Checkpoint.from_dict(\n", + " dict(epoch=t, model=model.module.state_dict())\n", + " )\n", + " session.report(dict(loss=test_loss), checkpoint=checkpoint)\n", "\n", " print(\"Done!\")" ] diff --git 
a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index f9609fe8d962..514e8fad3a4e 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -619,12 +619,11 @@ }, "outputs": [], "source": [ - "from ray import train\n", + "from ray.air import session, Checkpoint\n", "from ray.train.tensorflow import prepare_dataset_shard\n", - "from ray.tune.integration.keras import TuneReportCallback\n", "\n", "def train_loop_per_worker():\n", - " dataset_shard = train.get_dataset_shard(\"train\")\n", + " dataset_shard = session.get_dataset_shard(\"train\")\n", "\n", " strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n", " with strategy.scope():\n", @@ -653,7 +652,12 @@ "\n", " model.fit(tf_dataset, verbose=0)\n", " # This saves checkpoint in a way that can be used by Ray Serve coherently.\n", - " train.save_checkpoint(epoch=epoch, model=model.get_weights())" + " session.report(\n", + " {},\n", + " checkpoint=Checkpoint.from_dict(\n", + " dict(epoch=epoch, model=model.get_weights())\n", + " ),\n", + " )" ] }, { diff --git a/doc/source/ray-air/examples/torch_image_example.ipynb b/doc/source/ray-air/examples/torch_image_example.ipynb index 98cbfbee814e..10600a6291ed 100644 --- a/doc/source/ray-air/examples/torch_image_example.ipynb +++ b/doc/source/ray-air/examples/torch_image_example.ipynb @@ -253,8 +253,8 @@ "\n", "`train_loop_per_worker` contains regular PyTorch code with a few notable exceptions:\n", "* We wrap our model with {py:func}`train.torch.prepare_model `.\n", - "* We call {py:func}`train.get_dataset_shard ` and {py:meth}`Dataset.to_torch ` to convert a subset of our training data to a Torch dataset.\n", - "* We save model state using {py:func}`train.save_checkpoint `." + "* We call {py:func}`session.get_dataset_shard ` and {py:meth}`Dataset.to_torch ` to convert a subset of our training data to a Torch dataset.\n", + "* We save model state using {py:func}`session.report `." 
] }, { @@ -265,6 +265,7 @@ "outputs": [], "source": [ "from ray import train\n", + "from ray.air import session, Checkpoint\n", "import torch.optim as optim\n", "\n", "\n", @@ -274,7 +275,7 @@ " criterion = nn.CrossEntropyLoss()\n", " optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n", "\n", - " train_dataset_shard: torch.utils.data.Dataset = train.get_dataset_shard(\"train\").to_torch(\n", + " train_dataset_shard: torch.utils.data.Dataset = session.get_dataset_shard(\"train\").to_torch(\n", " feature_columns=[\"image\"],\n", " label_column=\"label\",\n", " batch_size=config[\"batch_size\"],\n", @@ -303,7 +304,10 @@ " print(f\"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}\")\n", " running_loss = 0.0\n", "\n", - " train.save_checkpoint(model=model.module.state_dict())" + " session.report(\n", + " dict(running_loss=running_loss),\n", + " checkpoint=Checkpoint.from_dict(dict(model=model.module.state_dict())),\n", + " )" ] }, { diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 03d7ed476781..dffcd987b438 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -498,11 +498,11 @@ "\n", "The training loop takes in a `config` Dict as an argument that we can use to pass in any configurations for training.\n", "\n", - "This is just standard PyTorch training, with the difference being that we can leverage [Ray Train's utility functions](https://docs.ray.io/en/master/train/api.html#training-function-utilities):\n", + "This is just standard PyTorch training, with the difference being that we can leverage [Ray Train's utility functions](https://docs.ray.io/en/master/train/api.html#training-function-utilities) and [Ray AIR Sesssion](https://docs.ray.io/en/master/ray-air/package-ref.html#module-ray.air.session):\n", "- `ray.train.torch.prepare_model(...)`: This will prepare the model for distributed training by wrapping it in PyTorch `DistributedDataParallel` and moving it to the correct accelerator device.\n", - "- `ray.train.get_dataset_shard(...)`: This will get the Ray Dataset shard for this particular Data Parallel worker.\n", - "- `ray.train.save_checkpoint(...)`: This will tell Ray Train to save the provided arguments as a checkpoint. Checkpoints will be written to disk under the `~/ray_results` directory.\n", - "- `ray.train.load_checkpoint()`: Returns a checkpoint to resume from. This is useful for either fault tolerance purposes, or for our purposes, to continue training the same model on a new incoming dataset." + "- `ray.air.session.get_dataset_shard(...)`: This will get the Ray Dataset shard for this particular Data Parallel worker.\n", + "- `ray.air.session.report({}, checkpoint=...)`: This will tell Ray Train to persist the provided `Checkpoint` object.\n", + "- `ray.air.session.get_checkpoint()`: Returns a checkpoint to resume from. This is useful for either fault tolerance purposes, or for our purposes, to continue training the same model on a new incoming dataset." 
] }, { @@ -514,6 +514,7 @@ "outputs": [], "source": [ "from ray import train\n", + "from ray.air import session, Checkpoint\n", "\n", "from torch.optim import SGD\n", "from torch.nn import CrossEntropyLoss\n", @@ -529,9 +530,9 @@ " model = SimpleMLP(num_classes=10)\n", "\n", " # Load model from checkpoint if there is a checkpoint to load from.\n", - " checkpoint_to_load = train.load_checkpoint()\n", + " checkpoint_to_load = session.get_checkpoint()\n", " if checkpoint_to_load:\n", - " state_dict_to_resume_from = checkpoint_to_load[\"model\"]\n", + " state_dict_to_resume_from = checkpoint_to_load.to_dict()[\"model\"]\n", " model.load_state_dict(state_dict=state_dict_to_resume_from)\n", "\n", " model = train.torch.prepare_model(model)\n", @@ -540,7 +541,7 @@ " criterion = CrossEntropyLoss()\n", "\n", " # Get the Ray Dataset shard for this data parallel worker, and convert it to a PyTorch Dataset.\n", - " dataset_shard = train.get_dataset_shard(\"train\").to_torch(\n", + " dataset_shard = session.get_dataset_shard(\"train\").to_torch(\n", " label_column=\"label\",\n", " batch_size=batch_size,\n", " unsqueeze_feature_tensors=False,\n", @@ -548,6 +549,7 @@ " )\n", "\n", " for epoch_idx in range(num_epochs):\n", + " running_loss = 0\n", " for iteration, (train_mb_x, train_mb_y) in enumerate(dataset_shard):\n", " optimizer.zero_grad()\n", " train_mb_x = train_mb_x.to(train.torch.get_device())\n", @@ -562,13 +564,15 @@ " # Update\n", " optimizer.step()\n", "\n", - " if train.world_rank() == 0 and iteration % 500 == 0:\n", + " running_loss += loss.item()\n", + " if session.get_world_rank() == 0 and iteration % 500 == 0:\n", " print(f\"loss: {loss.item():>7f}, epoch: {epoch_idx}, iteration: {iteration}\")\n", "\n", " # Checkpoint model after every epoch.\n", " state_dict = model.state_dict()\n", " consume_prefix_in_state_dict_if_present(state_dict, \"module.\")\n", - " train.save_checkpoint(model=state_dict)" + " checkpoint = Checkpoint.from_dict(dict(model=state_dict))\n", + " session.report({\"loss\": running_loss}, checkpoint=checkpoint)" ] }, { @@ -1237,7 +1241,6 @@ "source": [ "from ray.train.torch import TorchTrainer\n", "from ray.train.torch import TorchPredictor\n", - "from ray.air import Checkpoint\n", "from ray import serve\n", "from ray.serve.model_wrappers import ModelWrapperDeployment\n", "from ray.serve.http_adapters import json_to_ndarray\n", diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index dcc9635251ba..a915c3029f27 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -18,6 +18,7 @@ import boto3 import mlflow import pandas as pd +from ray.train.torch.torch_trainer import TorchTrainer import torch import torch.nn as nn import torch.optim as optim @@ -25,10 +26,9 @@ import ray from ray import train +from ray.air import session, Checkpoint, RunConfig from ray.data.aggregate import Mean, Std -from ray.train import Trainer -from ray.train.callbacks import TBXLoggerCallback -from ray.train.callbacks.logging import MLflowLoggerCallback +from ray.air.callbacks.mlflow import MLflowLoggerCallback def make_and_upload_dataset(dir_path): @@ -404,14 +404,16 @@ def train_func(config): # Setup device. 
device = torch.device( - f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu" + f"cuda:{session.get_local_rank()}" + if use_gpu and torch.cuda.is_available() + else "cpu" ) print(f"Device: {device}") # Setup data. - train_dataset_pipeline = train.get_dataset_shard("train_dataset") + train_dataset_pipeline = session.get_dataset_shard("train") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() - test_dataset = train.get_dataset_shard("test_dataset") + test_dataset = session.get_dataset_shard("test") test_torch_dataset = test_dataset.to_torch( label_column="label", batch_size=batch_size ) @@ -456,20 +458,20 @@ def train_func(config): f"{test_num_correct} / {test_num_total} = {test_acc:.4f}" ) - # Record and log stats. - train.report( - train_acc=train_acc, - train_loss=train_running_loss, - test_acc=test_acc, - test_loss=test_running_loss, - ) - # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net - train.save_checkpoint(model_state_dict=module.state_dict()) + checkpoint = Checkpoint.from_dict(dict(model=module.cpu())) - if train.world_rank() == 0: - return module.cpu() + # Record and log stats. + session.report( + dict( + train_acc=train_acc, + train_loss=train_running_loss, + test_acc=test_acc, + test_loss=test_running_loss, + ), + checkpoint=checkpoint, + ) if __name__ == "__main__": @@ -598,7 +600,7 @@ def train_func(config): train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() del train_dataset - datasets = {"train_dataset": train_dataset_pipeline, "test_dataset": test_dataset} + datasets = {"train": train_dataset_pipeline, "test": test_dataset} config = { "use_gpu": use_gpu, @@ -611,15 +613,8 @@ def train_func(config): "num_features": num_features, } - # Create 2 callbacks: one for TensorBoard Logging and one for MLflow - # logging. Pass these into Trainer, and all results that are - # reported by ``train.report()`` will be logged to these 2 places. - # TODO: TBXLoggerCallback should create nonexistent logdir - # and should also create 1 directory per file. - tbx_logdir = "./runs" - os.makedirs(tbx_logdir, exist_ok=True) + # Create the MLflowLoggerCallback callbacks = [ - TBXLoggerCallback(logdir=tbx_logdir), MLflowLoggerCallback( experiment_name="cuj-big-data-training", save_artifact=True ), @@ -628,18 +623,19 @@ def train_func(config): # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None - trainer = Trainer( - backend="torch", - num_workers=num_workers, - use_gpu=use_gpu, - resources_per_worker=resources_per_worker, - ) - trainer.start() - results = trainer.run( - train_func=train_func, config=config, callbacks=callbacks, dataset=datasets + trainer = TorchTrainer( + train_func, + train_loop_config=config, + datasets=datasets, + scaling_config=dict( + num_workers=num_workers, + use_gpu=use_gpu, + resources_per_worker=resources_per_worker, + ), + run_config=RunConfig(callbacks=callbacks), ) - model = results[0] - trainer.shutdown() + results = trainer.fit() + model = results.checkpoint.to_dict()["model"] if args.mlflow_register_model: mlflow.pytorch.log_model( diff --git a/doc/source/train/faq.rst b/doc/source/train/faq.rst index 3230b4ff6041..f28fbecaf9d1 100644 --- a/doc/source/train/faq.rst +++ b/doc/source/train/faq.rst @@ -9,7 +9,7 @@ How fast is Ray Train compared to PyTorch, TensorFlow, etc.? 
At its core, training speed should be the same - while Ray Train launches distributed training workers via Ray Actors, communication during training (e.g. gradient synchronization) is handled by the backend training framework itself. -For example, when running Ray Train with the ``"torch"`` backend, +For example, when running Ray Train with the ``TorchTrainer``, distributed training communication is done with Torch's ``DistributedDataParallel``. How do I set resources? @@ -18,7 +18,7 @@ How do I set resources? By default, each worker will reserve 1 CPU resource, and an additional 1 GPU resource if ``use_gpu=True``. To override these resource requests or request additional custom resources, -you can initialize the ``Trainer`` with ``resources_per_worker``. +you can initialize the ``Trainer`` with ``resources_per_worker`` specified in ``scaling_config``. .. note:: Some GPU utility functions (e.g. :ref:`train-api-torch-get-device`, :ref:`train-api-torch-prepare-model`) @@ -36,5 +36,5 @@ If you try to create a Matplotlib plot in the training function, you may encount To handle this, consider the following approaches: -1. If there is no dependency on any code in your training function, simply move the Matplotlib logic out and execute it before or after ``trainer.run``. -2. If you are plotting metrics, you can pass the metrics via ``train.report()`` and create a :ref:`custom callback ` to plot the results. +1. If there is no dependency on any code in your training function, simply move the Matplotlib logic out and execute it before or after ``trainer.fit()``. +2. If you are plotting metrics, you can pass the metrics via ``session.report()`` and create a :ref:`custom callback ` to plot the results. diff --git a/doc/source/tune/examples/horovod_simple.ipynb b/doc/source/tune/examples/horovod_simple.ipynb index bcf716f99dc3..5fe71d15f809 100644 --- a/doc/source/tune/examples/horovod_simple.ipynb +++ b/doc/source/tune/examples/horovod_simple.ipynb @@ -40,8 +40,8 @@ "import torch\n", "\n", "import ray\n", - "from ray import train\n", "from ray import tune\n", + "from ray.air import session\n", "from ray.train.horovod import HorovodTrainer\n", "from ray.tune.tune_config import TuneConfig\n", "from ray.tune.tuner import Tuner\n", @@ -119,7 +119,7 @@ "\n", " optimizer.step()\n", " time.sleep(0.1)\n", - " train.report(loss=loss.item())\n", + " session.report(dict(loss=loss.item()))\n", " total = time.time() - start\n", " print(f\"Took {total:0.3f} s. 
Avg: {total / num_steps:0.3f} s.\")\n", "\n", diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb index 0d8e36efcebc..5862d75e0d43 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb @@ -2034,7 +2034,7 @@ } ], "source": [ - "from ray.air.train.integrations.tensorflow import TensorflowTrainer\n", + "from ray.train.tensorflow import TensorflowTrainer\n", "from ray.air.result import Result\n", "\n", "def train_tensorflow_mnist(\n", From cc7d0663167cbf2e77c5fb76402f64442fdcc9fc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 19:25:35 +0000 Subject: [PATCH 62/70] Fix horovod test --- .../horovod/workloads/horovod_tune_test.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/release/air_tests/horovod/workloads/horovod_tune_test.py b/release/air_tests/horovod/workloads/horovod_tune_test.py index bab14b9bbf0e..e437d4e99abe 100755 --- a/release/air_tests/horovod/workloads/horovod_tune_test.py +++ b/release/air_tests/horovod/workloads/horovod_tune_test.py @@ -58,6 +58,7 @@ def train_loop_per_worker(config): trainloader = DataLoader( trainset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=4 ) + trainloader_len = len(trainloader) for epoch in range(epoch, 40): # loop over the dataset multiple times running_loss = 0.0 @@ -79,21 +80,22 @@ def train_loop_per_worker(config): # print statistics running_loss += loss.item() epoch_steps += 1 + if i == trainloader_len - 1: + checkpoint = Checkpoint.from_dict( + dict( + model_state=net.state_dict(), + optimizer_state=optimizer.state_dict(), + epoch=epoch, + ) + ) + else: + checkpoint = None + session.report(dict(loss=running_loss / epoch_steps), checkpoint=checkpoint) if i % 2000 == 1999: # print every 2000 mini-batches print( "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps) ) - session.report( - dict(loss=running_loss / epoch_steps), - checkpoint=Checkpoint.from_dict( - dict( - model_state=net.state_dict(), - optimizer_state=optimizer.state_dict(), - epoch=epoch, - ) - ), - ) if __name__ == "__main__": From 6cf961ea14f297165097412ce88e179ca9a537a0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 20:56:29 +0000 Subject: [PATCH 63/70] Fix CI --- .../_examples/datasets_train/datasets_train.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index a915c3029f27..69560c14c31c 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -18,6 +18,7 @@ import boto3 import mlflow import pandas as pd +from ray.air.config import DatasetConfig from ray.train.torch.torch_trainer import TorchTrainer import torch import torch.nn as nn @@ -596,11 +597,7 @@ def train_func(config): DROPOUT_EVERY = 5 DROPOUT_PROB = 0.2 - # Random global shuffle - train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() - del train_dataset - - datasets = {"train": train_dataset_pipeline, "test": test_dataset} + datasets = {"train": train_dataset, "test": test_dataset} config = { "use_gpu": use_gpu, @@ -633,6 +630,9 @@ def train_func(config): resources_per_worker=resources_per_worker, ), run_config=RunConfig(callbacks=callbacks), + 
dataset_config={ + "train": DatasetConfig(use_stream_api=True, global_shuffle=True) + }, ) results = trainer.fit() model = results.checkpoint.to_dict()["model"] From 330c36be89b7a9b294d52caf69380146333b0128 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 22:05:24 +0000 Subject: [PATCH 64/70] Fix CI --- .../ray-core/_examples/datasets_train/datasets_train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index 69560c14c31c..8f539d37ae04 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -631,7 +631,9 @@ def train_func(config): ), run_config=RunConfig(callbacks=callbacks), dataset_config={ - "train": DatasetConfig(use_stream_api=True, global_shuffle=True) + "train": DatasetConfig( + use_stream_api=True, stream_window_size=-1, global_shuffle=True + ) }, ) results = trainer.fit() From 30c9ab896cda4d1cc7e8ba94c03b93cf015e024d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 22:59:58 +0000 Subject: [PATCH 65/70] Fix CI --- .../datasets_train/datasets_train.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index 8f539d37ae04..350fe50d61a2 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -416,7 +416,7 @@ def train_func(config): train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = session.get_dataset_shard("test") test_torch_dataset = test_dataset.to_torch( - label_column="label", batch_size=batch_size + label_column="label", batch_size=batch_size, drop_last=True ) net = Net( @@ -461,9 +461,10 @@ def train_func(config): # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net - checkpoint = Checkpoint.from_dict(dict(model=module.cpu())) + checkpoint = Checkpoint.from_dict(dict(model=module.state_dict())) # Record and log stats. 
+ print(f"session report on {session.get_world_rank()}") session.report( dict( train_acc=train_acc, @@ -637,9 +638,27 @@ def train_func(config): }, ) results = trainer.fit() - model = results.checkpoint.to_dict()["model"] + state_dict = results.checkpoint.to_dict()["model"] + + def load_model_func(): + num_layers = config["num_layers"] + num_hidden = config["num_hidden"] + dropout_every = config["dropout_every"] + dropout_prob = config["dropout_prob"] + num_features = config["num_features"] + + model = Net( + n_layers=num_layers, + n_features=num_features, + num_hidden=num_hidden, + dropout_every=dropout_every, + drop_prob=dropout_prob, + ) + model.load_state_dict(state_dict) + return model if args.mlflow_register_model: + model = load_model_func() mlflow.pytorch.log_model( model, artifact_path="models", registered_model_name="torch_model" ) @@ -658,26 +677,6 @@ def load_model_func(): model_uri = f"models:/torch_model/{latest_version}" return mlflow.pytorch.load_model(model_uri) - else: - state_dict = model.state_dict() - - def load_model_func(): - num_layers = config["num_layers"] - num_hidden = config["num_hidden"] - dropout_every = config["dropout_every"] - dropout_prob = config["dropout_prob"] - num_features = config["num_features"] - - model = Net( - n_layers=num_layers, - n_features=num_features, - num_hidden=num_hidden, - dropout_every=dropout_every, - drop_prob=dropout_prob, - ) - model.load_state_dict(state_dict) - return model - class BatchInferModel: def __init__(self, load_model_func): self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") From d0affbcfa2f452bfd42d7ae4852431a533908437 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 16:06:48 +0000 Subject: [PATCH 66/70] Fix tests --- .../examples/horovod/horovod_pytorch_example.py | 5 +++++ python/ray/train/huggingface/_huggingface_utils.py | 2 +- python/ray/train/tests/test_huggingface_trainer.py | 14 +++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/ray/air/examples/horovod/horovod_pytorch_example.py b/python/ray/air/examples/horovod/horovod_pytorch_example.py index 946cddc4fd59..62bcfaa4e92d 100644 --- a/python/ray/air/examples/horovod/horovod_pytorch_example.py +++ b/python/ray/air/examples/horovod/horovod_pytorch_example.py @@ -142,6 +142,7 @@ def train_func(config): model, optimizer, train_loader, train_sampler = setup(config) + results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda @@ -151,8 +152,12 @@ def train_func(config): else: checkpoint_dict = dict(model=model) checkpoint_dict = Checkpoint.from_dict(checkpoint_dict) + results.append(loss) session.report(dict(loss=loss), checkpoint=checkpoint_dict) + # Only used for testing. 
+ return results + def main(num_workers, use_gpu, kwargs): trainer = HorovodTrainer( diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/huggingface/_huggingface_utils.py index d7b50f810d99..7fc4237d16cd 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/huggingface/_huggingface_utils.py @@ -149,7 +149,7 @@ def on_save(self, args, state, control, **kwargs): ) def _report(self): - if self.delayed_report: + if self.delayed_report["metrics"]: session.report(**self.delayed_report) self.delayed_report = {"metrics": {}, "checkpoint": None} diff --git a/python/ray/train/tests/test_huggingface_trainer.py b/python/ray/train/tests/test_huggingface_trainer.py index bde24f1bf7a1..a1b30cdeb48d 100644 --- a/python/ray/train/tests/test_huggingface_trainer.py +++ b/python/ray/train/tests/test_huggingface_trainer.py @@ -107,7 +107,7 @@ def test_reporting(): def _fake_report(**kwargs): reports.append(kwargs) - with patch("ray.train.report", _fake_report): + with patch("ray.air.session.report", _fake_report): state = TrainerState() report_callback = TrainReportCallback() report_callback.on_epoch_begin(None, state, None) @@ -125,12 +125,12 @@ def _fake_report(**kwargs): report_callback.on_train_end(None, state, None) assert len(reports) == 2 - assert "log1" in reports[0] - assert "log2" in reports[0] - assert reports[0]["epoch"] == 1 - assert "log1" in reports[1] - assert "log2" in reports[1] - assert reports[1]["epoch"] == 2 + assert "log1" in reports[0]["metrics"] + assert "log2" in reports[0]["metrics"] + assert reports[0]["metrics"]["epoch"] == 1 + assert "log1" in reports[1]["metrics"] + assert "log2" in reports[1]["metrics"] + assert reports[1]["metrics"]["epoch"] == 2 if __name__ == "__main__": From 587ad5634779021acc6ac63a6df49ae67bbf47d3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:24:59 +0000 Subject: [PATCH 67/70] Add todo --- python/ray/train/examples/torch_fashion_mnist_example.py | 2 ++ python/ray/train/examples/torch_linear_example.py | 1 + python/ray/train/examples/tune_cifar_torch_pbt_example.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 6e8db3220db4..5d716cb2dd91 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -114,6 +114,8 @@ def train_func(config: Dict): train.report(loss=loss) loss_results.append(loss) + # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return loss_results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index ceabd0c2853f..892cbb486244 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -81,6 +81,7 @@ def train_func(config): train.report(**result) results.append(result) # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results diff --git a/python/ray/train/examples/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py index f0b5c786ff8d..bddc01e1cd95 100644 --- a/python/ray/train/examples/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -117,6 +117,8 @@ def train_func(config): train.report(**result) results.append(result) + # 
return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results From 139f44d495083261ac0cb3ff33c80f18d59a5ff0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:33:30 +0000 Subject: [PATCH 68/70] Use `trial_logdir` instead --- python/ray/air/result.py | 3 ++- python/ray/train/examples/mlflow_simple_example.py | 2 +- python/ray/tune/result_grid.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 833aa7660f33..a6cf8fe47353 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from ray.air.checkpoint import Checkpoint @@ -38,7 +39,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - log_dir: Optional[str] + log_dir: Optional[Path] metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d64d0525ae58..b3a7264b20d8 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -41,7 +41,7 @@ def train_func(): # Print the latest run directory and keep note of it. # For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 -print("Run directory:", result.log_dir) +print("Run directory:", result.log_dir.parent) # TensorBoard is saved in parent dir # How to visualize the logs diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 77994e2f491b..bdf39b97f4ef 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from typing import Optional, Union import pandas as pd @@ -180,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.local_dir, + log_dir=Path(trial.logdir), metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 3a4d3f347d3f1170269ce77ad4adbd7ab057e32d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 21:49:00 +0000 Subject: [PATCH 69/70] Fix --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index bdf39b97f4ef..bec6a1438bee 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -181,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=Path(trial.logdir), + log_dir=Path(trial.logdir) if trial.logdir else None, metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 2ea93d78a4bffc4d2b3e4e000dafdae3dc18390f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 7 Jul 2022 17:51:47 +0000 Subject: [PATCH 70/70] Only print metrics --- python/ray/train/examples/horovod/horovod_example.py | 2 +- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- python/ray/train/examples/tensorflow_linear_dataset_example.py | 2 +- python/ray/train/examples/tensorflow_mnist_example.py | 2 +- .../auto_pipeline_for_host_to_device_data_transfer.py | 2 +- 
python/ray/train/examples/torch_fashion_mnist_example.py | 2 +- python/ray/train/examples/torch_linear_dataset_example.py | 2 +- python/ray/train/examples/torch_linear_example.py | 2 +- python/ray/train/examples/transformers/transformers_example.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c01788008ec5..8e930f7d151f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -158,7 +158,7 @@ def main(num_workers, use_gpu, kwargs): scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, ) results = trainer.fit() - print(results) + print(results.metrics) # Horovod Class API. diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 2d223c43ec1d..99f7b73a525a 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -17,7 +17,7 @@ def main(num_workers=2, use_gpu=False): ) final_results = trainer.fit() - print("Full results for rank 0 worker: ", final_results) + print("Final metrics: ", final_results.metrics) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 0ee9d48d2077..f3a938e06c0e 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -83,7 +83,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results}") + print(f"Results: {results.metrics}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 97e8db033025..14f4cf6dc7ef 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -81,7 +81,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index 03e69ca67f96..1220f541d034 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -109,7 +109,7 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 5d716cb2dd91..7ad5017bbc5c 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -126,7 +126,7 @@ def train_fashion_mnist(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": 
use_gpu}, ) result = trainer.fit() - print(f"Results: {result}") + print(f"Results: {result.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py index acfa0ce2e637..15fbf0da97b9 100644 --- a/python/ray/train/examples/torch_linear_dataset_example.py +++ b/python/ray/train/examples/torch_linear_dataset_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index 892cbb486244..8be2e1d2dcc6 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -94,7 +94,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index 1b47e5ce2e31..30f9f0158f06 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -619,7 +619,7 @@ def main(): scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) else: # Run training locally. train_func(config)
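
For reference, a minimal end-to-end sketch of the new Ray AIR training pattern that the examples in this series converge on (TorchTrainer + session.report + Checkpoint). This sketch is illustrative only and is not part of any diff above; the toy model, data, and hyperparameter values are assumptions, while the API calls mirror those already used in the patches.

    # Illustrative sketch only -- not part of the patch series above.
    # It combines the APIs the diffs migrate to (TorchTrainer, session.report,
    # Checkpoint); the toy model, data, and hyperparameters are assumptions.
    import torch
    import torch.nn as nn

    from ray.air import session, Checkpoint
    from ray.train.torch import TorchTrainer


    def train_loop_per_worker(config):
        model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
        loss_fn = nn.MSELoss()
        # Toy data; the real examples shard a Ray Dataset per worker via
        # session.get_dataset_shard() instead.
        x = torch.randn(32, 1)
        y = 2 * x

        for epoch in range(config["num_epochs"]):
            loss = loss_fn(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Metrics and checkpoints both flow through session.report(),
            # replacing the old train.report() / train.save_checkpoint() calls.
            session.report(
                dict(loss=loss.item(), epoch=epoch),
                checkpoint=Checkpoint.from_dict(dict(model=model.state_dict())),
            )


    trainer = TorchTrainer(
        train_loop_per_worker,
        train_loop_config={"lr": 1e-2, "num_epochs": 3},
        scaling_config={"num_workers": 2, "use_gpu": False},
    )
    result = trainer.fit()
    print(result.metrics)
    state_dict = result.checkpoint.to_dict()["model"]

The same structure applies to the HorovodTrainer and TensorflowTrainer examples touched above: the training function reports metrics and checkpoints through session.report(), and the final metrics and checkpoint are read off the Result object returned by trainer.fit().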