From b39a86490665efa7a307c21841edd6611f4bdfa8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 13 Jun 2022 21:10:19 +0000 Subject: [PATCH 01/70] Use new Train API for examples --- doc/source/train/examples.rst | 2 +- .../examples/tune_linear_dataset_example.rst | 6 -- .../tune_torch_linear_dataset_example.rst | 6 ++ python/ray/air/result.py | 6 +- python/ray/train/BUILD | 10 --- .../train/examples/horovod/horovod_example.py | 17 +++-- .../examples/mlflow_fashion_mnist_example.py | 21 +++--- .../tensorflow_linear_dataset_example.py | 18 +++-- .../examples/tensorflow_mnist_example.py | 12 ++-- .../train/examples/tensorflow_quick_start.py | 17 +++-- ...peline_for_host_to_device_data_transfer.py | 13 ++-- .../ray/train/examples/torch_quick_start.py | 19 +++--- .../examples/train_fashion_mnist_example.py | 20 +++--- .../examples/train_linear_dataset_example.py | 15 ++-- .../train/examples/train_linear_example.py | 14 ++-- .../transformers/transformers_example.py | 17 +++-- .../tune_cifar_pytorch_pbt_example.py | 56 ++++++++------- .../examples/tune_linear_dataset_example.py | 68 ------------------- .../ray/train/examples/tune_linear_example.py | 34 ++++++---- .../examples/tune_tensorflow_mnist_example.py | 36 +++++----- python/ray/tune/result_grid.py | 6 +- 21 files changed, 178 insertions(+), 235 deletions(-) delete mode 100644 doc/source/train/examples/tune_linear_dataset_example.rst create mode 100644 doc/source/train/examples/tune_torch_linear_dataset_example.rst delete mode 100644 python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 6affd7457a1c..e644f708b639 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/train/examples/tune_linear_dataset_example`: +* :doc:`/air/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. diff --git a/doc/source/train/examples/tune_linear_dataset_example.rst b/doc/source/train/examples/tune_linear_dataset_example.rst deleted file mode 100644 index d25af796465c..000000000000 --- a/doc/source/train/examples/tune_linear_dataset_example.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -tune_linear_dataset_example -=========================== - -.. literalinclude:: /../../python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst new file mode 100644 index 000000000000..22ad2e562660 --- /dev/null +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -0,0 +1,6 @@ +:orphan: + +tune_torch_linear_dataset_example +================================= + +.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..97472c64d395 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,5 +1,5 @@ -from typing import Any, Dict, Optional from dataclasses import dataclass +from typing import Any, Dict, Optional from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI @@ -13,7 +13,7 @@ class Result: This is the class produced by Trainer.fit(). It contains a checkpoint, which can be used for resuming training and for creating a Predictor object. 
It also contains a metrics object describing - training metrics. `error` is included so that non successful runs + training metrics. ``error`` is included so that non successful runs and trials can be represented as well. The constructor is a private API. @@ -22,11 +22,13 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. + log_dir: Directory where the trial logs are saved. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] + log_dir: Optional[str] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index dc2d6357b6cd..d33bad4d9ac2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -82,16 +82,6 @@ py_test( args = ["--smoke-test"] ) -py_test( - name = "tune_linear_dataset_example", - size = "medium", - main = "examples/tune_linear_dataset_example.py", - srcs = ["examples/tune_linear_dataset_example.py"], - tags = ["team:ml", "exclusive", "gpu_only", "tune"], - deps = [":train_lib"], - args = ["--smoke-test", "--use-gpu"] -) - py_test( name = "tune_linear_example", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index cb578b1fb18f..c3202307755f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -2,15 +2,16 @@ import os import horovod.torch as hvd -import ray import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed from filelock import FileLock -from ray.train import Trainer from torchvision import datasets, transforms +import ray +from ray.train.horovod import HorovodTrainer + def metric_average(val, name): tensor = torch.tensor(val) @@ -152,11 +153,13 @@ def train_func(config): def main(num_workers, use_gpu, kwargs): - trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers) - trainer.start() - loss_per_epoch = trainer.run(train_func, config=kwargs) - trainer.shutdown() - print(loss_per_epoch) + trainer = HorovodTrainer( + train_func, + train_loop_config=kwargs, + scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, + ) + results = trainer.fit() + print(results) # Horovod Class API. 
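# A minimal sketch of consuming the Result that the new trainers return from
# trainer.fit(), assuming only the fields shown in the python/ray/air/result.py
# diff above. inspect_result is a hypothetical helper, not something defined in Ray.
from ray.train.horovod import HorovodTrainer


def inspect_result(trainer: HorovodTrainer) -> None:
    result = trainer.fit()
    print(result.metrics)     # final metrics reported by the training function
    print(result.checkpoint)  # last checkpoint, usable for resuming or for a Predictor
    print(result.log_dir)     # directory where the trial logs are saved (newly added field)
    if result.error is not None:
        raise result.error    # set when the trial finished in error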
diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 05f915523543..7cd54b821859 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,20 +1,23 @@ import argparse -from ray.train import Trainer +from ray.air import RunConfig from ray.train.examples.train_fashion_mnist_example import train_func -from ray.train.callbacks.logging import MLflowLoggerCallback +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback def main(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - final_results = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, + run_config=RunConfig( + callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")] + ), ) + final_results = trainer.fit() - print("Full losses for rank 0 worker: ", final_results) + print("Full results for rank 0 worker: ", final_results) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index c1360195b36c..9271c5125da4 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -7,8 +7,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.tensorflow import prepare_dataset_shard +from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard class TrainReportCallback(Callback): @@ -55,7 +54,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. 
multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard() + dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() results = [] @@ -78,14 +77,13 @@ def train_func(config): def train_tensorflow_linear(num_workers=2, use_gpu=False): dataset_pipeline = get_dataset_pipeline() - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, - dataset=dataset_pipeline, - config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + datasets={"train": dataset_pipeline}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 3e89969cc58e..980e58652d95 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -10,7 +10,7 @@ from tensorflow.keras.callbacks import Callback import ray.train as train -from ray.train import Trainer +from ray.train.tensorflow import TensorflowTrainer class TrainReportCallback(Callback): @@ -81,12 +81,12 @@ def train_func(config): def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0907853135b9..0ac3666672e2 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -3,6 +3,9 @@ # __tf_setup_begin__ +import json +import os + import numpy as np import tensorflow as tf @@ -47,8 +50,6 @@ def train_func(): # __tf_distributed_begin__ -import json -import os def train_func_distributed(): per_worker_batch_size = 64 @@ -78,15 +79,13 @@ def train_func_distributed(): # __tf_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="tensorflow", num_workers=4) + from ray.train.tensorflow import TensorflowTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="tensorflow", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TensorflowTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + trainer.fit() # __tf_trainer_end__ diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index c8cc25b044a0..03e69ca67f96 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -5,8 +5,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer +from ray.train.torch import TorchTrainer class Net(nn.Module): @@ -94,7 +95,6 @@ def train_func(config): def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=True) config = { "lr": 1e-2, "hidden_size": num_hidden_layers, @@ -102,9 +102,12 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo "epochs": epochs, "use_auto_transfer": use_auto_transfer, } - trainer.start() - results = trainer.run(train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"use_gpu": True, "num_workers": num_workers}, + ) + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index e152c8604610..eaf07a95a5d1 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -4,6 +4,10 @@ # __torch_setup_begin__ import torch import torch.nn as nn +import torch.optim as optim + +import ray.train.torch +from ray import train num_samples = 20 input_size = 10 @@ -28,7 +32,6 @@ def forward(self, input): # __torch_single_begin__ -import torch.optim as optim def train_func(): num_epochs = 3 @@ -48,8 +51,6 @@ def train_func(): # __torch_distributed_begin__ -from ray import train -import ray.train.torch def train_func_distributed(): num_epochs = 3 @@ -78,15 +79,13 @@ def train_func_distributed(): # __torch_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="torch", num_workers=4) + from ray.train.torch import TorchTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="torch", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TorchTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + results = trainer.fit() # __torch_trainer_end__ diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/train_fashion_mnist_example.py index 5c172dc5a949..6e8db3220db4 100644 --- a/python/ray/train/examples/train_fashion_mnist_example.py +++ b/python/ray/train/examples/train_fashion_mnist_example.py @@ -2,14 +2,14 @@ from typing import Dict import torch -import ray.train as train -from ray.train.trainer import Trainer -from ray.train.callbacks import JsonLoggerCallback from torch import nn from torch.utils.data import DataLoader from torchvision import datasets from torchvision.transforms import ToTensor +import ray.train as train +from ray.train.torch import TorchTrainer + # Download training data from open datasets. training_data = datasets.FashionMNIST( root="~/data", @@ -118,15 +118,13 @@ def train_func(config: Dict): def train_fashion_mnist(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - result = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[JsonLoggerCallback()], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() - print(f"Loss results: {result}") + result = trainer.fit() + print(f"Results: {result}") if __name__ == "__main__": diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 2ce30c9b81b8..1cfbff434c9f 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -8,8 +8,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: @@ -120,16 +119,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): datasets = get_datasets() - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} - trainer.start() - results = trainer.run( + trainer = TorchTrainer( train_func, - config, - dataset=datasets, - callbacks=[JsonLoggerCallback(), TBXLoggerCallback()], + train_loop_config=config, + datasets=datasets, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 8a784190d3cc..40d850754401 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -3,9 +3,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer class 
LinearDataset(torch.utils.data.Dataset): @@ -86,13 +86,13 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run( - train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()] + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index ce269733d4de..b6cb461c4b73 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -20,14 +20,12 @@ import math import os import random -from typing import Dict, Any +from typing import Any, Dict import datasets -import ray import transformers from accelerate import Accelerator from datasets import load_dataset, load_metric -from ray.train import Trainer from torch.utils.data.dataloader import DataLoader from tqdm.auto import tqdm from transformers import ( @@ -44,6 +42,9 @@ ) from transformers.utils.versions import require_version +import ray +from ray.train.torch import TorchTrainer + logger = logging.getLogger(__name__) require_version( @@ -612,9 +613,13 @@ def main(): else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - trainer.start() - trainer.run(train_func, config) + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) + results = trainer.fit() + print(results) else: # Run training locally. 
train_func(config) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index c600684e2479..5e4711adae84 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -11,9 +11,11 @@ import ray import ray.train as train from ray import tune -from ray.train import Trainer -from ray.tune import CLIReporter +from ray.air.config import FailureConfig, RunConfig +from ray.train.torch import TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.util.ml_utils.resnet import ResNet18 @@ -149,8 +151,10 @@ def train_func(config): else: ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - Trainable = trainer.to_tune_trainable(train_func) + trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="loss", @@ -158,32 +162,32 @@ def train_func(config): perturbation_interval=1, hyperparam_mutations={ # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), + "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config/momentum": [0.8, 0.9, 0.99], }, ) - reporter = CLIReporter() - reporter.add_metric_column("loss", "loss") - - analysis = tune.run( - Trainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "batch_size": 128 * args.num_workers, - "epochs": args.num_epochs, - "test_mode": args.smoke_test, # whether to to subset the data + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "batch_size": 128 * args.num_workers, + "epochs": args.num_epochs, + "test_mode": args.smoke_test, # whether to to subset the data + } }, - stop={"training_iteration": 2 if args.smoke_test else 100}, - max_failures=3, # used for fault tolerance - checkpoint_freq=3, # used for fault tolerance - keep_checkpoints_num=1, # used for fault tolerance - verbose=2, - progress_reporter=reporter, - scheduler=pbt_scheduler, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 2 if args.smoke_test else 100}, + failure=FailureConfig(max_failures=3), # used for fault tolerance + ), ) - print(analysis.get_best_config(metric="loss", mode="min")) + results = tuner.fit() + + print(results.get_best_result(metric="loss", mode="min")) diff --git a/python/ray/train/examples/tune_linear_dataset_example.py b/python/ray/train/examples/tune_linear_dataset_example.py deleted file mode 100644 index adc04f9ba3e3..000000000000 --- a/python/ray/train/examples/tune_linear_dataset_example.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse - -import ray -from ray import tune -from ray.train import Trainer - -from train_linear_dataset_example import train_func, get_datasets - - -def tune_linear(num_workers, num_samples, use_gpu): - datasets = get_datasets() - - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - Trainable = trainer.to_tune_trainable(train_func, dataset=datasets) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, - }, - ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for testing.", - ) - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--num-samples", - type=int, - default=2, - help="Sets number of samples for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Use GPU for training." - ) - - args = parser.parse_args() - - if args.smoke_test: - # 1 for driver, 1 for datasets - num_cpus = args.num_workers + 2 - num_gpus = args.num_workers if args.use_gpu else 0 - ray.init(num_cpus=args.num_workers + 2, num_gpus=num_gpus) - else: - ray.init(address=args.address) - tune_linear( - num_workers=args.num_workers, use_gpu=args.use_gpu, num_samples=args.num_samples - ) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py index a0641906c202..5d4a8edc911b 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,27 +1,31 @@ import argparse +from train_linear_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from train_linear_example import train_func +from ray.train.torch import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_linear(num_workers, num_samples): - trainer = Trainer("torch", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, + trainer = TorchTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([4, 16, 32]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results + analysis = tuner.fit() + result = analysis.get_best_result(metric="loss", mode="min") + print(result) + return result if __name__ == "__main__": diff --git a/python/ray/train/examples/tune_tensorflow_mnist_example.py b/python/ray/train/examples/tune_tensorflow_mnist_example.py index 8ab6776c3b64..4fc408b2d6eb 100644 --- a/python/ray/train/examples/tune_tensorflow_mnist_example.py +++ b/python/ray/train/examples/tune_tensorflow_mnist_example.py @@ -1,28 +1,32 @@ import argparse +from tensorflow_mnist_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from tensorflow_mnist_example import train_func +from ray.train.tensorflow import TensorflowTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_tensorflow_mnist(num_workers, num_samples): - trainer = Trainer(backend="tensorflow", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": 3, + trainer = TensorflowTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - best_loss = analysis.get_best_config(metric="loss", mode="min") - best_accuracy = analysis.get_best_config(metric="accuracy", mode="max") - print(f"Best loss config: {best_loss}") - print(f"Best accuracy config: {best_accuracy}") + analysis = tuner.fit() + best_loss = analysis.get_best_result(metric="loss", mode="min") + best_accuracy = analysis.get_best_result(metric="accuracy", mode="max") + print(f"Best loss result: {best_loss}") + print(f"Best accuracy result: {best_accuracy}") return analysis diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 66440074a62f..f770e3f05f35 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -2,10 +2,11 @@ from typing import Optional, Union import pandas as pd -from ray.cloudpickle import cloudpickle -from ray.exceptions import RayTaskError + from ray.air.checkpoint import Checkpoint from ray.air.result import Result +from ray.cloudpickle import cloudpickle +from ray.exceptions import RayTaskError from ray.tune import ExperimentAnalysis from ray.tune.error import TuneError from ray.tune.trial import Trial @@ -177,5 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + log_dir=trial.logdir, ) return result From b31399ef71afd24f0ac84df9bcbb8a761be893b6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:15 +0000 Subject: [PATCH 02/70] Fix FailureConfig not being a dataclass --- python/ray/air/config.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 40b63603f51c..a8f5c2b85c66 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -1,14 +1,5 @@ from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional, Union from ray.air.constants import WILDCARD_KEY from ray.tune.syncer import SyncConfig @@ -267,6 +258,7 @@ def _merge(self, other: "DatasetConfig") -> "DatasetConfig": return new_config +@dataclass @PublicAPI(stability="alpha") class FailureConfig: """Configuration related to failure handling of each run/trial. 
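With FailureConfig now a proper dataclass, it can be constructed with keyword
arguments and handed to RunConfig, as the PBT example in the previous patch does.
A minimal sketch for reference; the ``failure`` keyword and ``max_failures`` field
are taken from the diffs above, and the value is illustrative only:

from ray.air.config import FailureConfig, RunConfig

# Retry a failing trial up to three times before marking it failed (illustrative value).
run_config = RunConfig(failure=FailureConfig(max_failures=3))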
From 5cc9229716f8526f10632fddf0ef282308a47da4 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:22 +0000 Subject: [PATCH 03/70] Fix errors --- .../examples/mlflow_fashion_mnist_example.py | 2 +- .../train/examples/mlflow_simple_example.py | 36 ++++++++++--------- .../examples/tensorflow_mnist_example.py | 2 +- .../train/examples/train_linear_example.py | 2 +- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 7cd54b821859..1cda7fc3e1ac 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -47,7 +47,7 @@ def main(num_workers=2, use_gpu=False): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) args.num_workers = 2 args.use_gpu = False else: diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index 548b44d96f3c..d61a435ce3e3 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -1,6 +1,8 @@ from ray import train -from ray.train import Trainer -from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback +from ray.air import RunConfig +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback +from ray.tune.logger import TBXLoggerCallback def train_func(): @@ -8,29 +10,31 @@ def train_func(): train.report(epoch=i) -trainer = Trainer(backend="torch", num_workers=2) -trainer.start() +trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": 2}, + run_config=RunConfig( + callbacks=[ + MLflowLoggerCallback(experiment_name="train_experiment"), + TBXLoggerCallback(), + ], + ), +) # Run the training function, logging all the intermediate results # to MLflow and Tensorboard. -result = trainer.run( - train_func, - callbacks=[ - MLflowLoggerCallback(experiment_name="train_experiment"), - TBXLoggerCallback(), - ], -) +result = trainer.fit() # Print the latest run directory and keep note of it. -# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001 -print("Run directory:", trainer.latest_run_dir) - -trainer.shutdown() +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 +print("Run directory:", result.logdir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ray_results/train_2021-09-01_12-00-00/run_001` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` # $ cd # # # View the MLflow UI. 
diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 980e58652d95..a0ef319f8756 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -120,7 +120,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_tensorflow_mnist() else: ray.init(address=args.address) diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 40d850754401..069c6dd13db1 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_linear() else: ray.init(address=args.address) From 523021843ab6886fbdce1f40381be0ed733ced98 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:01:17 +0000 Subject: [PATCH 04/70] Fix --- doc/source/train/examples/tune_torch_linear_dataset_example.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst index 22ad2e562660..df74e93ebdf2 100644 --- a/doc/source/train/examples/tune_torch_linear_dataset_example.rst +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -3,4 +3,4 @@ tune_torch_linear_dataset_example ================================= -.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py +.. literalinclude:: /../../python/ray/air/examples/pytorch/tune_torch_linear_dataset_example.py From ef4a3fcda417ca38dae7fc8caf37b289481dd064 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:46:39 +0000 Subject: [PATCH 05/70] Fix link --- doc/source/train/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index e644f708b639..2a4e0b75bbd1 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/air/examples/tune_torch_linear_dataset_example`: +* :doc:`/train/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. From f5cfe6262dfeb173663ab7693ff1dfd60b385208 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 19:47:22 +0000 Subject: [PATCH 06/70] Fix simple example --- .../train/examples/mlflow_simple_example.py | 25 +++++++++++++------ python/ray/tune/result_grid.py | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d61a435ce3e3..d64d0525ae58 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -25,20 +25,29 @@ def train_func(): # to MLflow and Tensorboard. result = trainer.fit() +# For MLFLow logs: + +# MLFlow logs will by default be saved in an `mlflow` directory +# in the current working directory. + +# $ cd mlflow +# # View the MLflow UI. 
+# $ mlflow ui + +# You can change the directory by setting the `tracking_uri` argument +# in `MLflowLoggerCallback`. + +# For TensorBoard logs: + # Print the latest run directory and keep note of it. -# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 -print("Run directory:", result.logdir) +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 +print("Run directory:", result.log_dir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06` # $ cd # -# # View the MLflow UI. -# $ mlflow ui -# # # View the tensorboard UI. # $ tensorboard --logdir . diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index f770e3f05f35..38ebf357e5e4 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -178,6 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.logdir, + log_dir=trial.local_dir, ) return result From 468f7e80f48049f041a8c5fb038cc2d49a280b14 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:09:43 +0000 Subject: [PATCH 07/70] train loop utils --- python/ray/train/train_loop_utils.py | 80 ++++++++++++++++------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index b4dde5f4ca7b..774fda94a324 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -1,11 +1,8 @@ -from typing import TYPE_CHECKING -from typing import Optional, Dict, Union import warnings +from typing import TYPE_CHECKING, Dict, Optional, Union +from ray.train._internal.session import get_session from ray.train.constants import SESSION_MISUSE_LOG_ONCE_KEY -from ray.train._internal.session import ( - get_session, -) from ray.util import PublicAPI, log_once if TYPE_CHECKING: @@ -41,23 +38,25 @@ def get_dataset_shard( import ray from ray import train + from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard().to_torch() + data_shard = train.get_dataset_shard("train").to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() - trainer = Trainer(backend="torch") - trainer.start() - # Trainer will automatically handle sharding. - train_model = trainer.run(train_func, dataset=dataset) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + datasets={"train": dataset}, + scaling_config={"num_workers": 2}, + ) + trainer.fit() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -98,16 +97,15 @@ def report(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() Args: **kwargs: Any key value pair to be reported by Train. 
@@ -129,6 +127,7 @@ def world_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -136,10 +135,8 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() """ session = get_session() @@ -156,16 +153,18 @@ def local_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = Trainer(backend="torch", use_gpu=True) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + scaling_config={"use_gpu": True, "num_workers": 2}, + ) + trainer.fit() """ session = get_session() @@ -181,18 +180,29 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train + from ray.air import Checkpoint + from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func, checkpoint={"epoch": 3}) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + "epoch": 3, + } + ) + trainer = TorchTrainer( + train_func, + resume_from_checkpoint=checkpoint, + scaling_config={"num_workers": 2}, + ) + trainer.fit() # 3 # 4 - trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -216,16 +226,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + result = trainer.fit() + assert result.checkpoint Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -245,14 +255,14 @@ def world_size() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = Trainer(backend="torch", num_workers=4) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) + result = trainer.fit() + """ session = get_session() if session is None: From 4ef6302cc5d4bfd3c00caaeb2e9ad6c678946858 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:29:42 +0000 Subject: [PATCH 08/70] Remove tensorboard example --- python/ray/train/BUILD | 9 -- .../torch_tensorboard_profiler_example.py | 84 ------------------- 2 files changed, 93 deletions(-) delete mode 100644 python/ray/train/examples/torch_tensorboard_profiler_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index d33bad4d9ac2..bf73935be49c 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -39,15 +39,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "torch_tensorboard_profiler_example", - size = "small", - main = "examples/torch_tensorboard_profiler_example.py", - srcs = ["examples/torch_tensorboard_profiler_example.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "transformers_example_gpu", size = "large", diff --git a/python/ray/train/examples/torch_tensorboard_profiler_example.py b/python/ray/train/examples/torch_tensorboard_profiler_example.py deleted file mode 100644 index 5f3641c31c8d..000000000000 --- a/python/ray/train/examples/torch_tensorboard_profiler_example.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -import torch -from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present -from torch.profiler import profile, record_function, schedule - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import TBXLoggerCallback -from ray.train.callbacks.profile import TorchTensorboardProfilerCallback -from ray.train.torch import TorchWorkerProfiler - - -def train_func(): - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - # Setup model. - model = torch.nn.Linear(1, 1) - model = train.torch.prepare_model(model) - loss_fn = torch.nn.MSELoss() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-2) - - # Setup data. - input = torch.randn(1000, 1) - labels = input * 2 - dataset = torch.utils.data.TensorDataset(input, labels) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=32) - dataloader = train.torch.prepare_data_loader(dataloader) - - # Train. 
- for epoch in range(5): - with record_function("train_epoch"): - for X, y in dataloader: - pred = model(X) - loss = loss_fn(pred, y) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - with record_function("train_checkpoint"): - state_dict = model.state_dict() - consume_prefix_in_state_dict_if_present(state_dict, "module.") - train.save_checkpoint(epoch=epoch, model_weights=state_dict) - - p.step() - - with record_function("train_report"): - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Enables GPU training" - ) - - args = parser.parse_args() - - ray.init(address=args.address) - - callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()] - trainer = Trainer( - backend="torch", num_workers=args.num_workers, use_gpu=args.use_gpu - ) - trainer.start() - trainer.run(train_func, callbacks=callbacks) - trainer.shutdown() From 5db3c14e400cc94da0448dd26b3cf5328b82ea4d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:30:09 +0000 Subject: [PATCH 09/70] PBT test update --- .../tune_cifar_pytorch_pbt_example.py | 13 ++-- .../workloads/pytorch_pbt_failure.py | 77 ++++++++++--------- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index 5e4711adae84..a7031b3116a1 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -58,6 +58,7 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): + # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) @@ -157,14 +158,14 @@ def train_func(config): ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "train_loop_config/momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index 903e2a1cc553..d354b2834ac6 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -4,16 +4,16 @@ import numpy as np import ray - from ray import tune +from ray.air.config import RunConfig +from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.tune.utils.mock import FailureInjectorCallback from 
ray.tune.utils.release_test_util import ProgressCallback -from ray.train import Trainer -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func -from ray.train.torch import TorchConfig - parser = argparse.ArgumentParser() parser.add_argument( "--smoke-test", @@ -26,46 +26,53 @@ ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) num_training_workers = 1 if args.smoke_test else 3 -trainer = Trainer( - num_workers=num_training_workers, - use_gpu=not args.smoke_test, - backend=TorchConfig(backend="gloo"), +trainer = TorchTrainer( + train_func, + scaling_config=dict( + num_workers=num_training_workers, + use_gpu=not args.smoke_test, + ), + torch_config=TorchConfig(backend="gloo"), ) -TorchTrainable = trainer.to_tune_trainable(train_func=train_func) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) -analysis = tune.run( - TorchTrainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "head_location": None, - "worker_locations": None, - "test_mode": args.smoke_test, - "batch_size": 128 * num_training_workers, - # For the long running test, we want the training to run forever, and it will - # be terminated by the release test infra. - "epochs": 1 if args.smoke_test else sys.maxsize, +tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "head_location": None, + "worker_locations": None, + "test_mode": args.smoke_test, + "batch_size": 128 * num_training_workers, + # For the long running test, we want the training to run forever, + # and it will be terminated by the release test infra. 
+ "epochs": 1 if args.smoke_test else sys.maxsize, + } }, - max_failures=-1, # used for fault tolerance - checkpoint_freq=2, # used for fault tolerance - scheduler=pbt_scheduler, - callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], - stop={"training_iteration": 1} if args.smoke_test else None, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 1} if args.smoke_test else None, + callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], + ), ) -print(analysis.get_best_config(metric="loss", mode="min")) +results = tuner.fit() + +print(results.get_best_result(metric="loss", mode="min")) From cb805f297c6f14e0878cdca9fbc3774d4070191e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:48:31 +0000 Subject: [PATCH 10/70] WIP --- .../transformers/transformers_example.py | 2 +- python/ray/train/tests/test_examples.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index b6cb461c4b73..1b47e5ce2e31 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -609,7 +609,7 @@ def main(): if args.start_local or args.address or args.num_workers > 1 or args.use_gpu: if args.start_local: # Start a local Ray runtime. - ray.init(num_cpus=args.num_workers) + ray.init(num_cpus=args.num_workers + 2) else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index c72249b95ad5..10ebefa03588 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -19,7 +19,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -35,14 +37,11 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers) + trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() + results = trainer.fit() - assert len(results) == num_workers - result = results[0] + result = results.metrics loss = result["loss"] assert len(loss) == epochs @@ -56,17 +55,15 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): def test_tf_non_distributed(ray_start_2_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(tf_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer.fit() def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = Trainer("tensorflow", num_workers=2) + trainer = 
TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) From 2f69e37e50b57e282c406f198096848a6e03c5d1 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:08:55 +0000 Subject: [PATCH 11/70] Do not use pipeline --- .../ray/train/examples/train_linear_dataset_example.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 1cfbff434c9f..f84faa5f7a11 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -7,11 +7,10 @@ import ray import ray.train as train from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: +def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -23,12 +22,9 @@ def get_dataset(a, b, size) -> Dataset: [split] ) - train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() - validation_dataset_pipeline = validation_dataset.repeat() - datasets = { - "train": train_dataset_pipeline, - "validation": validation_dataset_pipeline, + "train": train_dataset, + "validation": validation_dataset, } return datasets From 0d8eeb4a879f161e7215f117c75ef4ded4358099 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:14:21 +0000 Subject: [PATCH 12/70] Remove callback test --- python/ray/train/BUILD | 8 - python/ray/train/tests/test_callbacks.py | 357 ----------------------- 2 files changed, 365 deletions(-) delete mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index bf73935be49c..1db84dd79d37 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -113,14 +113,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "test_callbacks", - size = "medium", - srcs = ["tests/test_callbacks.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py deleted file mode 100644 index 9aeb54088801..000000000000 --- a/python/ray/train/tests/test_callbacks.py +++ /dev/null @@ -1,357 +0,0 @@ -from typing import Dict, List -import glob -import io -import json -from collections import defaultdict -from contextlib import redirect_stdout -from pathlib import Path - -import pytest - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import ( - TrainingCallback, - JsonLoggerCallback, - PrintCallback, - TBXLoggerCallback, - TorchTensorboardProfilerCallback, -) -from ray.train.callbacks.logging import ( - MLflowLoggerCallback, - _TrainCallbackLogdirManager, -) -from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, - BASIC_AUTOFILLED_KEYS, - ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from 
ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, -) - -try: - from tensorflow.python.summary.summary_iterator import summary_iterator -except ImportError: - summary_iterator = None - - -@pytest.fixture -def ray_start_4_cpus(): - address_info = ray.init(num_cpus=4) - yield address_info - # The code after the yield will run as teardown code. - ray.shutdown() - - -class TestConfig(BackendConfig): - @property - def backend_cls(self): - return TestBackend - - -class TestBackend(Backend): - def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - -def test_print(ray_start_4_cpus): - num_workers = 4 - - def train_func(): - train.report(rank=train.world_rank()) - - stream = io.StringIO() - with redirect_stdout(stream): - trainer = Trainer(TestConfig(), num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[PrintCallback()]) - trainer.shutdown() - - output = stream.getvalue() - results = json.loads(output) - - assert len(results) == num_workers - for i, result in enumerate(results): - assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) - assert result["rank"] == i - - -@pytest.mark.parametrize("input", [None, "dir", "file"]) -def test_train_callback_logdir_manager(tmp_path, input): - default_dir = tmp_path / "default_dir" - - if input == "dir": - input_logdir = tmp_path / "dir" - input_logdir.mkdir(parents=True) - elif input == "file": - input_logdir = tmp_path / "file" - input_logdir.touch() - else: - input_logdir = None - - logdir_manager = _TrainCallbackLogdirManager(input_logdir) - - if input_logdir: - path = logdir_manager.logdir_path - assert path == logdir_manager.logdir_path - else: - with pytest.raises(RuntimeError): - path = logdir_manager.logdir_path - - if input_logdir and not Path(input_logdir).is_dir(): - with pytest.raises(FileExistsError): - logdir_manager.setup_logdir(str(default_dir)) - else: - path = logdir_manager.setup_logdir(str(default_dir)) - assert path == logdir_manager.logdir_path - - -@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) -@pytest.mark.parametrize("detailed", [False, True]) -@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) -def test_json( - monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename -): - if detailed: - monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") - - config = TestConfig() - - num_iters = 5 - num_workers = 4 - - if workers_to_log is None: - num_workers_to_log = num_workers - elif isinstance(workers_to_log, int): - num_workers_to_log = 1 - else: - num_workers_to_log = len(workers_to_log) - - def train_func(): - for i in range(num_iters): - train.report(index=i) - return 1 - - if filename is None: - # if None, use default value - callback = JsonLoggerCallback(workers_to_log=workers_to_log) - else: - callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) - trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - if filename is None: - assert str(callback.log_path.name) == JsonLoggerCallback._default_filename - else: - assert str(callback.log_path.name) == filename - - with open(callback.log_path, "r") as f: - log = json.load(f) - print(log) - assert len(log) == num_iters - assert len(log[0]) == num_workers_to_log - assert all(len(element) == len(log[0]) for element in log) 
- assert all( - all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) - for element in log - ) - assert all( - all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) - for element in log - ) - if detailed: - assert all( - all( - all(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - else: - assert all( - all( - not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - - -def _validate_tbx_result(events_dir): - events_file = list(glob.glob(f"{events_dir}/events*"))[0] - results = defaultdict(list) - for event in summary_iterator(events_file): - for v in event.summary.value: - assert v.tag.startswith("ray/train") - results[v.tag[10:]].append(v.simple_value) - - assert len(results["episode_reward_mean"]) == 3 - assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] - assert len(results["score"]) == 1 - assert len(results["hello/world"]) == 1 - - -def test_TBX(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - - def train_func(): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) - return 1 - - callback = TBXLoggerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - _validate_tbx_result(temp_dir) - - -def test_mlflow(ray_start_4_cpus, tmp_path): - config = TestConfig() - - params = {"p1": "p1"} - - temp_dir = tmp_path - num_workers = 4 - - def train_func(config): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6) - return 1 - - callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, config=params, callbacks=[callback]) - - from mlflow.tracking import MlflowClient - - client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) - - experiment_id = client.get_experiment_by_name("test_exp").experiment_id - all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) - assert len(all_runs) == 1 - # all_runs is a pandas dataframe. 
- all_runs = all_runs.to_dict(orient="records") - run_id = all_runs[0]["run_id"] - run = client.get_run(run_id) - - assert run.data.params == params - assert ( - "episode_reward_mean" in run.data.metrics - and run.data.metrics["episode_reward_mean"] == 6.0 - ) - assert ( - TRAINING_ITERATION in run.data.metrics - and run.data.metrics[TRAINING_ITERATION] == 3.0 - ) - - metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") - - assert len(metric_history) == 3 - iterations = [metric.step for metric in metric_history] - assert iterations == [1, 2, 3] - rewards = [metric.value for metric in metric_history] - assert rewards == [4, 5, 6] - - -def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - num_epochs = 2 - - def train_func(): - from ray.train.torch import TorchWorkerProfiler - from torch.profiler import profile, record_function, schedule - - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - for epoch in range(num_epochs): - with record_function("test_function"): - pass - - p.step() - - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - callback = TorchTensorboardProfilerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - assert temp_dir.exists() - - count = 0 - for path in temp_dir.iterdir(): - assert path.is_file() - count += 1 - assert count == num_workers * num_epochs - - -# fix issue: repeat assignments for preprocessor results nested recursive calling -# see https://github.com/ray-project/ray/issues/25005 -def test_hotfix_callback_nested_recusive_calling(): - # test callback used to simulate the nested recursive calling for preprocess() - class TestCallback(TrainingCallback): - def __init__(self): - self.max_process_time = 0 - - def count_process_times(self, processor): - count = 0 - if processor: - if isinstance(processor, SequentialResultsPreprocessor): - for preprocessor in processor.preprocessors: - # recursive calling preprocessors in list - count += self.count_process_times(preprocessor) - else: - count = 1 - return count - - def handle_result(self, results: List[Dict], **info): - process_times = self.count_process_times(self.results_preprocessor) - if process_times > self.max_process_time: - self.max_process_time = process_times - print(f"process times: {process_times}") - - def train_func(): - for idx in range(num_iterates): - train.report(iterate=idx + 1) - - # python default limitation for iterate depth - num_iterates = 1000 - trainer = Trainer(TestConfig(), num_workers=1) - trainer.start() - test_callback = TestCallback() - trainer.run(train_func, callbacks=[test_callback]) - assert test_callback.max_process_time == 1 - print(f"callback max process time: {test_callback.max_process_time}") - trainer.shutdown() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", "-x", __file__])) From 4a3103ec4a3fcd06761a9ddf51895407bbef3c87 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 19:10:27 +0000 Subject: [PATCH 13/70] Examples tests --- .../train/examples/horovod/horovod_example.py | 3 +- python/ray/train/tests/test_examples.py | 119 +++++++++--------- 2 files changed, 62 insertions(+), 60 deletions(-) diff --git 
a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c3202307755f..1e163da70052 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -10,6 +10,7 @@ from torchvision import datasets, transforms import ray +from ray import train from ray.train.horovod import HorovodTrainer @@ -148,7 +149,7 @@ def train_func(config): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - results.append(loss) + train.report(loss=loss) return results diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 10ebefa03588..e37cf43a4687 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -2,9 +2,10 @@ import ray from ray.train import Trainer +from ray.train.constants import TRAINING_ITERATION +from ray.train.examples.horovod.horovod_example import HorovodTrainClass from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, - HorovodTrainClass, ) from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, @@ -19,52 +20,56 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. 
ray.shutdown() @pytest.mark.parametrize("num_workers", [1, 2]) -def test_tensorflow_mnist(ray_start_2_cpus, num_workers): +def test_tensorflow_mnist(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) results = trainer.fit() result = results.metrics - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] - - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs -def test_tf_non_distributed(ray_start_2_cpus): +def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer = TorchTrainer( + tf_quick_start_train_func, scaling_config=dict(num_workers=1) + ) trainer.fit() -def test_tensorflow_mnist_fail(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) + ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -85,24 +90,24 @@ def test_tensorflow_mnist_fail(ray_start_2_cpus): @pytest.mark.parametrize("num_workers", [1, 2]) -def test_torch_linear(ray_start_2_cpus, num_workers): +def test_torch_linear(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run(linear_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers + trainer = TorchTrainer( + linear_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_linear_failure(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -113,56 +118,51 @@ def test_torch_linear_failure(ray_start_2_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + assert result[TRAINING_ITERATION] == epochs -def test_torch_fashion_mnist(ray_start_2_cpus): +def test_torch_fashion_mnist(ray_start_4_cpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == 
num_workers + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_non_distributed(ray_start_2_cpus): +def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(torch_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer( + torch_quick_start_train_func, scaling_config=dict(num_workers=1) + ) + trainer.fit() -def test_horovod_torch_mnist(ray_start_2_cpus): +def test_horovod_torch_mnist(ray_start_4_cpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers -def test_horovod_torch_mnist_stateful(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 trainer = Trainer("horovod", num_workers) @@ -180,7 +180,8 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From f7f3ea8559f3b2237895108d03cdc985646f9c3f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 20:14:22 +0000 Subject: [PATCH 14/70] Move tests --- python/ray/train/tests/test_examples.py | 12 ++- python/ray/train/tests/test_gpu.py | 85 +++++++++--------- python/ray/train/tests/test_minimal.py | 59 +++++------- python/ray/train/tests/test_tune.py | 114 ++++++++++++++---------- 4 files changed, 138 insertions(+), 132 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index e37cf43a4687..06c88577205e 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -61,15 +61,13 @@ def test_tf_non_distributed(ray_start_4_cpus): trainer.fit() -@pytest.mark.skip("Refactor as a backend test.") -def test_tensorflow_mnist_fail(ray_start_4_cpus): +# TODO: Refactor as a backend test. +def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 + trainer = Trainer("tensorflow", num_workers=2) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer = TensorflowTrainer( - tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) - ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -106,7 +104,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -161,7 +159,7 @@ def test_horovod_torch_mnist(ray_start_4_cpus): assert result[TRAINING_ITERATION] == num_workers -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 875ad766ebda..16dac0c42fc7 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -1,15 +1,17 @@ import os -import pytest from timeit import default_timer as timer +import pytest import torch +import torchvision +from test_tune import torch_fashion_mnist, tune_tensorflow_mnist from torch.nn.parallel import DistributedDataParallel from torch.utils.data import DataLoader, DistributedSampler -import torchvision import ray import ray.train as train from ray.train import Trainer, TrainingCallback +from ray.train.constants import TRAINING_ITERATION from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, ) @@ -20,7 +22,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import LinearDataset -from test_tune import torch_fashion_mnist, tune_tensorflow_mnist +from ray.train.horovod.horovod_trainer import HorovodTrainer +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -38,6 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -64,6 +69,7 @@ def train_fn(): ) +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -85,6 +91,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -108,6 +115,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -154,6 +162,7 @@ def train_func(): assert result1 == result2 +@pytest.mark.skip("Refactor as a backend test.") def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -196,6 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) +@pytest.mark.skip("Refactor as a backend test.") def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -213,6 +223,7 @@ def train_func(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -287,55 +298,47 @@ def test_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = 
trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers - result = results[0] + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] + result = results.metrics - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs def test_torch_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + assert result[TRAINING_ITERATION] == epochs def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers, use_gpu=True) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers, use_gpu=True), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers def test_tune_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): @@ -349,9 +352,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): from ray.train.examples.train_linear_dataset_example import train_linear - results = train_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_linear(num_workers=2, use_gpu=True) def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): @@ -359,11 +360,10 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): train_tensorflow_linear, ) - results = train_tensorflow_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_tensorflow_linear(num_workers=2, use_gpu=True) +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ @@ -376,8 +376,8 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): def test_auto_transfer_data_from_host_to_device( ray_start_1_cpu_1_gpu, device_choice, auto_transfer ): - import torch import numpy as np + import torch def compute_average_runtime(func): device = torch.device(device_choice) @@ -417,7 +417,8 @@ def host_to_device_auto_pipeline(device): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", "-s", __file__])) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index c6c6d6bba7b3..5f3be1d4c3b3 
100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,18 +1,16 @@ -from typing import List, Dict - import pytest import ray import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import TrainingCallback +from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -32,15 +30,7 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass -class TestCallback(TrainingCallback): - def __init__(self): - self.result_list = [] - - def handle_result(self, results: List[Dict], **info): - self.result_list.append(results) - - -def test_run(ray_start_2_cpus): +def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 key = "value" @@ -53,27 +43,23 @@ def train_func(): train.save_checkpoint(**checkpoint) return checkpoint[key] - checkpoint = {key: value} - test_callback = TestCallback() - - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + key: value, + } + ) - # Test results. - assert len(results) == num_workers - assert all(result == 1 for result in results) + trainer = DataParallelTrainer( + train_func, + backend_config=config, + resume_from_checkpoint=checkpoint, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - # Test reporting and callbacks. - assert len(test_callback.result_list) == value - assert len(test_callback.result_list[0]) == num_workers - print(test_callback.result_list[0]) - assert all(result[key] == value for result in test_callback.result_list[0]) - - # Test checkpointing. 
- assert trainer.latest_checkpoint[key] == value - - trainer.shutdown() + assert results.checkpoint def test_failure(): @@ -89,7 +75,8 @@ def test_failure(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index f08b3da43dc6..2fed4e42fa43 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -1,25 +1,31 @@ import os import pytest + import ray import ray.train as train from ray import tune from ray.air import Checkpoint -from ray.tune import TuneError +from ray.air.config import FailureConfig, RunConfig from ray.train import Trainer +from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) from ray.train.examples.train_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train._internal.worker_group import WorkerGroup +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -50,18 +56,24 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): def torch_fashion_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(fashion_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + trainer = TorchTrainer( + fashion_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. for path, df in analysis.trial_dataframes.items(): @@ -74,18 +86,25 @@ def test_tune_torch_fashion_mnist(ray_start_8_cpus): def tune_tensorflow_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(tensorflow_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. 
for path, df in analysis.trial_dataframes.items(): @@ -96,18 +115,7 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) -def test_tune_error(ray_start_2_cpus): - def train_func(config): - raise RuntimeError("Error in training function!") - - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - with pytest.raises(TuneError): - tune.run(TestTrainable) - - -def test_tune_checkpoint(ray_start_2_cpus): +def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) @@ -123,7 +131,7 @@ def train_func(): assert checkpoint["hello"] == "world" -def test_reuse_checkpoint(ray_start_2_cpus): +def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 ckpt = train.load_checkpoint() @@ -134,19 +142,28 @@ def train_func(config): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 4 - analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 10}}, + ).restore(trial.local_dir) + analysis = tuner.fit()._experiment_analysis trial_dfs = list(analysis.trial_dataframes.values()) assert len(trial_dfs[0]["training_iteration"]) == 5 -def test_retry(ray_start_2_cpus): +def test_retry(ray_start_4_cpus): def train_func(): ckpt = train.load_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? @@ -160,10 +177,12 @@ def train_func(): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) - analysis = tune.run(TestTrainable, max_failures=3) + analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 3 @@ -173,7 +192,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From 50ca40b3a9b701d3a97a256295de9505f5d82605 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 21:20:04 +0000 Subject: [PATCH 15/70] Fixture fix --- python/ray/train/tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 06c88577205e..169c0a29e236 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -62,7 +62,7 @@ def test_tf_non_distributed(ray_start_4_cpus): # TODO: Refactor as a backend test. 
-def test_tensorflow_mnist_fail(ray_start_2_cpus): +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 From 20b707571febcff0ec8f2d9ba79e8005d56f85fd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 15:48:50 +0000 Subject: [PATCH 16/70] CI fixes --- .../examples/train_linear_dataset_example.py | 22 ++++++++++++++----- python/ray/train/tests/test_examples.py | 6 +++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index f84faa5f7a11..3038ac66aa9e 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -1,16 +1,19 @@ import argparse -from typing import Dict +from typing import Dict, Tuple import torch import torch.nn as nn import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: +def get_datasets_and_configs( + a=5, b=10, size=1000, split=0.8 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -27,7 +30,13 @@ def get_dataset(a, b, size) -> Dataset: "validation": validation_dataset, } - return datasets + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + "validation": DatasetConfig(use_stream_api=True), + } + + return datasets, dataset_configs def train_epoch(iterable_dataset, model, loss_fn, optimizer, device): @@ -113,13 +122,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): - datasets = get_datasets() + datasets, dataset_configs = get_datasets_and_configs() config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} trainer = TorchTrainer( train_func, train_loop_config=config, datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -152,8 +162,8 @@ def train_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 169c0a29e236..fd6a2fadbf91 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -116,9 +116,11 @@ def test_torch_linear_failure(ray_start_4_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - result = results.metrics + assert len(results) == num_workers - assert result[TRAINING_ITERATION] == epochs + for result in results: + assert len(result) == epochs + assert result[-1]["loss"] < result[0]["loss"] def test_torch_fashion_mnist(ray_start_4_cpus): From c3b7d42c5f15cf3d44fec370df0c5cff9443b96e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 16:51:32 +0000 Subject: [PATCH 17/70] Fix --- .../ray/train/examples/tensorflow_quick_start.py | 7 +++---- python/ray/train/examples/torch_quick_start.py | 7 +++---- 
python/ray/train/tests/test_gpu.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0ac3666672e2..f0c7f3d10f4e 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -1,15 +1,12 @@ # flake8: noqa # fmt: off +# isort: skip_file # __tf_setup_begin__ -import json -import os - import numpy as np import tensorflow as tf - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. @@ -50,6 +47,8 @@ def train_func(): # __tf_distributed_begin__ +import json +import os def train_func_distributed(): per_worker_batch_size = 64 diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index eaf07a95a5d1..2f0da37ddbc9 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -1,13 +1,10 @@ # flake8: noqa # fmt: off +# isort: skip_file # __torch_setup_begin__ import torch import torch.nn as nn -import torch.optim as optim - -import ray.train.torch -from ray import train num_samples = 20 input_size = 10 @@ -32,6 +29,7 @@ def forward(self, input): # __torch_single_begin__ +import torch.optim as optim def train_func(): num_epochs = 3 @@ -51,6 +49,7 @@ def train_func(): # __torch_distributed_begin__ +from ray import train def train_func_distributed(): num_epochs = 3 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 16dac0c42fc7..ac9a0afe7cfb 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -42,7 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -69,7 +69,7 @@ def train_fn(): ) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -91,7 +91,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -115,7 +115,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -162,7 +162,7 @@ def train_func(): assert result1 == result2 -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -205,7 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -223,7 +223,7 @@ def train_func(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -363,7 +363,7 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): assert train_tensorflow_linear(num_workers=2, use_gpu=True) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ From 37b81825e4c6fe406f88cc92446afac021ca4c74 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 19:28:38 +0000 Subject: [PATCH 18/70] Apply suggestions from code review --- python/ray/train/BUILD | 8 + .../train/examples/horovod/horovod_example.py | 2 - .../tensorflow_linear_dataset_example.py | 30 +- .../examples/train_linear_dataset_example.py | 5 - .../train/examples/train_linear_example.py | 5 - python/ray/train/tests/test_callbacks.py | 357 ++++++++++++++++++ python/ray/train/tests/test_minimal.py | 2 +- python/ray/train/tests/test_tune.py | 28 +- 8 files changed, 407 insertions(+), 30 deletions(-) create mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6124c35c2606..6f719b725e64 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -129,6 +129,14 @@ py_test( deps = [":train_lib"] ) +py_test( + name = "test_callbacks", + size = "medium", + srcs = ["tests/test_callbacks.py"], + tags = ["team:ml", "exclusive"], + deps = [":train_lib"] +) + py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index 1e163da70052..c01788008ec5 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -144,13 +144,11 @@ def train_func(config): model, optimizer, train_loader, train_sampler = setup(config) - results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) train.report(loss=loss) - return results def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9271c5125da4..9dbb3205b7bf 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -1,12 +1,13 @@ import argparse +from typing import Dict, Tuple import tensorflow as tf from tensorflow.keras.callbacks import Callback import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard @@ -15,17 +16,22 @@ def on_epoch_end(self, epoch, logs=None): train.report(**logs) -def get_dataset_pipeline(a=5, b=10, size=1000) -> DatasetPipeline: +def get_datasets_and_configs( + a=5, b=10, size=1000 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) return dataset - dataset = 
get_dataset(a, b, size) + datasets = {"train": get_dataset(a, b, size)} - dataset_pipeline = dataset.repeat().random_shuffle_each_window() + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + } - return dataset_pipeline + return datasets, dataset_configs def build_and_compile_model(config): @@ -57,7 +63,6 @@ def train_func(config): dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() - results = [] for _ in range(epochs): dataset = next(dataset_iterator) tf_dataset = prepare_dataset_shard( @@ -70,17 +75,16 @@ def train_func(config): batch_size=batch_size, ) ) - history = multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) - results.append(history.history) - return results + multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) def train_tensorflow_linear(num_workers=2, use_gpu=False): - dataset_pipeline = get_dataset_pipeline() + datasets, dataset_configs = get_datasets_and_configs() trainer = TensorflowTrainer( train_func, train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, - datasets={"train": dataset_pipeline}, + datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -113,8 +117,8 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 3038ac66aa9e..acfa0ce2e637 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -87,8 +87,6 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs() validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs() @@ -116,9 +114,6 @@ def train_func(config): train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False): diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 069c6dd13db1..7e09acef3d3d 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,15 +74,10 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py new file mode 100644 index 000000000000..b21adf6634b9 --- /dev/null +++ b/python/ray/train/tests/test_callbacks.py @@ -0,0 +1,357 @@ +from typing import Dict, List +import glob +import io +import json +from collections import defaultdict +from contextlib import 
redirect_stdout +from pathlib import Path + +import pytest + +import ray +import ray.train as train +from ray.train import Trainer +from ray.train.backend import BackendConfig, Backend +from ray.train.callbacks import ( + TrainingCallback, + JsonLoggerCallback, + PrintCallback, + TBXLoggerCallback, + TorchTensorboardProfilerCallback, +) +from ray.train.callbacks.logging import ( + MLflowLoggerCallback, + _TrainCallbackLogdirManager, +) +from ray.train.constants import ( + TRAINING_ITERATION, + DETAILED_AUTOFILLED_KEYS, + BASIC_AUTOFILLED_KEYS, + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) + +try: + from tensorflow.python.summary.summary_iterator import summary_iterator +except ImportError: + summary_iterator = None + + +@pytest.fixture +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) + yield address_info + # The code after the yield will run as teardown code. + ray.shutdown() + + +class TestConfig(BackendConfig): + @property + def backend_cls(self): + return TestBackend + + +class TestBackend(Backend): + def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + +def test_print(ray_start_4_cpus): + num_workers = 4 + + def train_func(): + train.report(rank=train.world_rank()) + + stream = io.StringIO() + with redirect_stdout(stream): + trainer = Trainer(TestConfig(), num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[PrintCallback()]) + trainer.shutdown() + + output = stream.getvalue() + results = json.loads(output) + + assert len(results) == num_workers + for i, result in enumerate(results): + assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) + assert result["rank"] == i + + +@pytest.mark.parametrize("input", [None, "dir", "file"]) +def test_train_callback_logdir_manager(tmp_path, input): + default_dir = tmp_path / "default_dir" + + if input == "dir": + input_logdir = tmp_path / "dir" + input_logdir.mkdir(parents=True) + elif input == "file": + input_logdir = tmp_path / "file" + input_logdir.touch() + else: + input_logdir = None + + logdir_manager = _TrainCallbackLogdirManager(input_logdir) + + if input_logdir: + path = logdir_manager.logdir_path + assert path == logdir_manager.logdir_path + else: + with pytest.raises(RuntimeError): + path = logdir_manager.logdir_path + + if input_logdir and not Path(input_logdir).is_dir(): + with pytest.raises(FileExistsError): + logdir_manager.setup_logdir(str(default_dir)) + else: + path = logdir_manager.setup_logdir(str(default_dir)) + assert path == logdir_manager.logdir_path + + +@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) +@pytest.mark.parametrize("detailed", [False, True]) +@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) +def test_json( + monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename +): + if detailed: + monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") + + config = TestConfig() + + num_iters = 5 + num_workers = 4 + + if workers_to_log is None: + num_workers_to_log = num_workers + elif isinstance(workers_to_log, int): + num_workers_to_log = 1 + else: + num_workers_to_log = len(workers_to_log) + + def train_func(): + for i in range(num_iters): + train.report(index=i) + return 1 + + if filename is None: + # if None, use default value + 
callback = JsonLoggerCallback(workers_to_log=workers_to_log) + else: + callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) + trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + if filename is None: + assert str(callback.log_path.name) == JsonLoggerCallback._default_filename + else: + assert str(callback.log_path.name) == filename + + with open(callback.log_path, "r") as f: + log = json.load(f) + print(log) + assert len(log) == num_iters + assert len(log[0]) == num_workers_to_log + assert all(len(element) == len(log[0]) for element in log) + assert all( + all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) + for element in log + ) + assert all( + all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) + for element in log + ) + if detailed: + assert all( + all( + all(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + else: + assert all( + all( + not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + + +def _validate_tbx_result(events_dir): + events_file = list(glob.glob(f"{events_dir}/events*"))[0] + results = defaultdict(list) + for event in summary_iterator(events_file): + for v in event.summary.value: + assert v.tag.startswith("ray/train") + results[v.tag[10:]].append(v.simple_value) + + assert len(results["episode_reward_mean"]) == 3 + assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] + assert len(results["score"]) == 1 + assert len(results["hello/world"]) == 1 + + +def test_TBX(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + + def train_func(): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) + return 1 + + callback = TBXLoggerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + _validate_tbx_result(temp_dir) + + +def test_mlflow(ray_start_4_cpus, tmp_path): + config = TestConfig() + + params = {"p1": "p1"} + + temp_dir = tmp_path + num_workers = 4 + + def train_func(config): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6) + return 1 + + callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, config=params, callbacks=[callback]) + + from mlflow.tracking import MlflowClient + + client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) + + experiment_id = client.get_experiment_by_name("test_exp").experiment_id + all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) + assert len(all_runs) == 1 + # all_runs is a pandas dataframe. 
+ all_runs = all_runs.to_dict(orient="records") + run_id = all_runs[0]["run_id"] + run = client.get_run(run_id) + + assert run.data.params == params + assert ( + "episode_reward_mean" in run.data.metrics + and run.data.metrics["episode_reward_mean"] == 6.0 + ) + assert ( + TRAINING_ITERATION in run.data.metrics + and run.data.metrics[TRAINING_ITERATION] == 3.0 + ) + + metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") + + assert len(metric_history) == 3 + iterations = [metric.step for metric in metric_history] + assert iterations == [1, 2, 3] + rewards = [metric.value for metric in metric_history] + assert rewards == [4, 5, 6] + + +def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + num_epochs = 2 + + def train_func(): + from ray.train.torch import TorchWorkerProfiler + from torch.profiler import profile, record_function, schedule + + twp = TorchWorkerProfiler() + with profile( + activities=[], + schedule=schedule(wait=0, warmup=0, active=1), + on_trace_ready=twp.trace_handler, + ) as p: + + for epoch in range(num_epochs): + with record_function("test_function"): + pass + + p.step() + + profile_results = twp.get_and_clear_profile_traces() + train.report(epoch=epoch, **profile_results) + + callback = TorchTensorboardProfilerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + assert temp_dir.exists() + + count = 0 + for path in temp_dir.iterdir(): + assert path.is_file() + count += 1 + assert count == num_workers * num_epochs + + +# fix issue: repeat assignments for preprocessor results nested recursive calling +# see https://github.com/ray-project/ray/issues/25005 +def test_hotfix_callback_nested_recusive_calling(): + # test callback used to simulate the nested recursive calling for preprocess() + class TestCallback(TrainingCallback): + def __init__(self): + self.max_process_time = 0 + + def count_process_times(self, processor): + count = 0 + if processor: + if isinstance(processor, SequentialResultsPreprocessor): + for preprocessor in processor.preprocessors: + # recursive calling preprocessors in list + count += self.count_process_times(preprocessor) + else: + count = 1 + return count + + def handle_result(self, results: List[Dict], **info): + process_times = self.count_process_times(self.results_preprocessor) + if process_times > self.max_process_time: + self.max_process_time = process_times + print(f"process times: {process_times}") + + def train_func(): + for idx in range(num_iterates): + train.report(iterate=idx + 1) + + # python default limitation for iterate depth + num_iterates = 1000 + trainer = Trainer(TestConfig(), num_workers=1) + trainer.start() + test_callback = TestCallback() + trainer.run(train_func, callbacks=[test_callback]) + assert test_callback.max_process_time == 1 + print(f"callback max process time: {test_callback.max_process_time}") + trainer.shutdown() + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 5f3be1d4c3b3..a23d7b4f23f9 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -59,7 +59,7 @@ def train_func(): ) results = trainer.fit() - assert results.checkpoint + assert results.checkpoint.to_dict()[key] == 
checkpoint.to_dict()[key] def test_failure(): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 2fed4e42fa43..3fac9a1e6599 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -7,7 +7,6 @@ from ray import tune from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig -from ray.train import Trainer from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig from ray.train.data_parallel_trainer import DataParallelTrainer @@ -115,16 +114,37 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) +def test_tune_error(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + ) + + # with pytest.raises(TuneError): + tuner.fit() + print("a") + + def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) train.save_checkpoint(hello="world") - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) - [trial] = tune.run(TestTrainable).trials + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data assert os.path.exists(checkpoint_path) checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() From 6f8d7e092c5ea919c99479781b8483f328ceaec6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:20 +0000 Subject: [PATCH 19/70] Fix tracked checkpoint error --- python/ray/util/ml_utils/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9dced9377750..9a27acd10e36 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -132,7 +132,7 @@ def to_air_checkpoint(self) -> Optional[Checkpoint]: checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data) checkpoint = Checkpoint.from_directory(checkpoint_dir) elif isinstance(checkpoint_data, bytes): - with tempfile.mkdtemp() as tmpdir: + with tempfile.TemporaryDirectory() as tmpdir: TrainableUtil.create_from_pickle(checkpoint_data, tmpdir) # Double wrap in checkpoint so we hold the data in memory and # can remove the temp directory From 85cb1a71e90ee21d5095a817ff10feb478da7922 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:39 +0000 Subject: [PATCH 20/70] CI fixes --- .../tensorflow_linear_dataset_example.py | 2 +- .../train/examples/train_linear_example.py | 4 +++ python/ray/train/tests/test_callbacks.py | 26 ++++++++++--------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9dbb3205b7bf..ccc408455b44 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -88,7 +88,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): 
scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results}") return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 7e09acef3d3d..ceabd0c2853f 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,10 +74,14 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) + results = [] for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) + results.append(result) + # return required for backwards compatibility with the old API + return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py index b21adf6634b9..eb6ed7c3db17 100644 --- a/python/ray/train/tests/test_callbacks.py +++ b/python/ray/train/tests/test_callbacks.py @@ -1,37 +1,37 @@ -from typing import Dict, List import glob import io import json from collections import defaultdict from contextlib import redirect_stdout from pathlib import Path +from typing import Dict, List import pytest import ray import ray.train as train from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig from ray.train.callbacks import ( - TrainingCallback, JsonLoggerCallback, PrintCallback, TBXLoggerCallback, TorchTensorboardProfilerCallback, + TrainingCallback, ) from ray.train.callbacks.logging import ( MLflowLoggerCallback, _TrainCallbackLogdirManager, ) from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, BASIC_AUTOFILLED_KEYS, + DETAILED_AUTOFILLED_KEYS, ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, + TRAINING_ITERATION, ) try: @@ -277,9 +277,10 @@ def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): num_epochs = 2 def train_func(): - from ray.train.torch import TorchWorkerProfiler from torch.profiler import profile, record_function, schedule + from ray.train.torch import TorchWorkerProfiler + twp = TorchWorkerProfiler() with profile( activities=[], @@ -351,7 +352,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys - sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file + import pytest + + sys.exit(pytest.main(["-v", "-x", __file__])) From 86a71d6bd6958b639e3eeaa19264bc99398c0aa5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 16:10:20 +0000 Subject: [PATCH 21/70] Add checkpoint configuration to `RunConfig` --- python/ray/air/config.py | 23 ++++++++++++++++++++++- python/ray/tune/impl/tuner_internal.py | 10 ++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index a8f5c2b85c66..d75dc3c0cc5d 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -273,6 +273,25 @@ class FailureConfig: max_failures: int = 0 +@dataclass +@PublicAPI(stability="alpha") +class CheckpointingConfig: + 
"""Configuration related to checkpointing of each run/trial. + + Args: + keep_checkpoints_num: Number of checkpoints to keep. A value of + `None` keeps all checkpoints. Defaults to `None`. If set, need + to provide `checkpoint_score_attr`. + checkpoint_score_attr: Specifies by which attribute to rank the + best checkpoint. Default is increasing order. If attribute starts + with `min-` it will rank attribute in decreasing order, i.e. + `min-validation_loss`. + """ + + keep_checkpoints_num: Optional[int] = None + checkpoint_score_attr: Optional[str] = None + + @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -298,8 +317,9 @@ class RunConfig: Currently only stateless callbacks are supported for resumed runs. (any state of the callback will not be checkpointed by Tune and thus will not take effect in resumed runs). - failure: The failure mode configuration. + failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. + checkpointing: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -312,4 +332,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None + checkpointing: Optional[CheckpointingConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 51d67a5a5c9b..7190f6c6ff38 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -166,6 +166,16 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: max_failures=( self._run_config.failure.max_failures if self._run_config.failure else 0 ), + keep_checkpoints_num=( + self._run_config.checkpointing.keep_checkpoints_num + if self._run_config.checkpointing + else None + ), + checkpoint_score_attr=( + self._run_config.checkpointing.checkpoint_score_attr + if self._run_config.checkpointing + else None + ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, From 41eb7809fd2f7bcb1188080f927d95cc5ed23645 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 18:41:58 +0000 Subject: [PATCH 22/70] Add `best_checkpoint` and `dataframe` to `Result` --- python/ray/air/result.py | 9 ++++ python/ray/tune/impl/tuner_internal.py | 49 ++++++++++++---------- python/ray/tune/result_grid.py | 6 +++ python/ray/tune/tests/test_result_grid.py | 50 +++++++++++++++++++++++ 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..2b52fb844244 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -4,6 +4,8 @@ from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI +import pandas as pd + @dataclass @PublicAPI(stability="alpha") @@ -21,12 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. + best_checkpoint: The best checkpoint of the Trainable, as + determined by the ``metric`` and ``mode`` arguments set. + If either of those has not been set, this will be None. + May be the same as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. 
+ dataframe: The full result dataframe of the Trainable. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] + best_checkpoint: Optional[Checkpoint] error: Optional[Exception] + dataframe: Optional[pd.DataFrame] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 7190f6c6ff38..b2ea167369d7 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -149,17 +149,11 @@ def fit(self) -> ResultGrid: return ResultGrid(analysis) - def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: - """Fitting for a fresh Tuner.""" - analysis = run( - trainable, - config={**param_space}, + def _get_tune_run_arguments(self) -> Dict[str, Any]: + """Get tune.run arguments common for both new and resumed runs.""" + return dict( mode=self._tune_config.mode, metric=self._tune_config.metric, - num_samples=self._tune_config.num_samples, - search_alg=self._tune_config.search_alg, - scheduler=self._tune_config.scheduler, - name=self._run_config.name, callbacks=self._run_config.callbacks, sync_config=self._run_config.sync_config, stop=self._run_config.stop, @@ -179,27 +173,38 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, + ) + + def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: + """Fitting for a fresh Tuner.""" + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + config={**param_space}, + num_samples=self._tune_config.num_samples, + search_alg=self._tune_config.search_alg, + scheduler=self._tune_config.scheduler, + name=self._run_config.name, + ), **self._tuner_kwargs, + } + analysis = run( + **args, ) return analysis def _fit_resume(self, trainable) -> ExperimentAnalysis: """Fitting for a restored Tuner.""" - analysis = run( - trainable, - resume=True, - mode=self._tune_config.mode, - metric=self._tune_config.metric, - callbacks=self._run_config.callbacks, - sync_config=self._run_config.sync_config, - stop=self._run_config.stop, - max_failures=( - self._run_config.failure.max_failures if self._run_config.failure else 0 + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + resume=True, ), - _experiment_checkpoint_dir=self._experiment_checkpoint_dir, - raise_on_failed_trial=False, **self._tuner_kwargs, - ) + } + analysis = run(**args) return analysis def __getstate__(self): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9d653ecb4991..2568ebe46b09 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -165,10 +165,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + try: + best_checkpoint = self._experiment_analysis.best_checkpoint + except ValueError: + best_checkpoint = None result = Result( checkpoint=checkpoint, + best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 77fae7453edf..dbcab0037d50 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ 
b/python/ray/tune/tests/test_result_grid.py @@ -3,6 +3,7 @@ import pickle import pytest +import pandas as pd import ray from ray import tune @@ -40,6 +41,55 @@ def f(config): assert result.metrics["config"] == result.config +def test_result_grid_metric_mode(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert os.path.normpath( + result.checkpoint.get_internal_representation()[1] + ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + +def test_result_grid_metric_mode_unset(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}) + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert result.best_checkpoint is None + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + def test_result_grid_no_checkpoint(ray_start_2_cpus): def f(config): pass From eb2eb6717ff59a29072a566377760fb5cc1e2025 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 19:53:31 +0000 Subject: [PATCH 23/70] Tests, fixes --- python/ray/air/__init__.py | 10 ++- python/ray/air/config.py | 43 +++++++++++-- python/ray/air/result.py | 6 +- python/ray/air/tests/test_api.py | 28 +++++++++ python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/tune/result_grid.py | 28 ++++++++- python/ray/tune/tests/test_result_grid.py | 4 +- python/ray/tune/tests/test_tuner.py | 76 ++++++++++++++++++++++- 8 files changed, 181 insertions(+), 16 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 196fa1aa7e35..2c82cce8f4e3 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -1,5 +1,11 @@ from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import ( + DatasetConfig, + RunConfig, + ScalingConfig, + FailureConfig, + CheckpointingConfig, +) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -11,5 +17,7 @@ "Result", "ScalingConfig", "DatasetConfig", + "FailureConfig", + "CheckpointingConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index d75dc3c0cc5d..ab9cfe79b67e 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -282,14 +282,47 @@ class CheckpointingConfig: 
keep_checkpoints_num: Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. - checkpoint_score_attr: Specifies by which attribute to rank the - best checkpoint. Default is increasing order. If attribute starts - with `min-` it will rank attribute in decreasing order, i.e. - `min-validation_loss`. + checkpoint_score_metric: Specifies by which metric to rank the + best checkpoint. Defaults to training iteration. + checkpoint_score_mode: Must be one of [min, max]. Determines + whether ``checkpoint_score_metric`` should be minimized or maximized. + If not set, will be the same as 'max'. Cannot be set if + ``checkpoint_score_metric`` is not set. """ keep_checkpoints_num: Optional[int] = None - checkpoint_score_attr: Optional[str] = None + checkpoint_score_metric: Optional[str] = None + checkpoint_score_mode: Optional[str] = None + + def __post_init__(self): + if self.checkpoint_score_mode not in (None, "min", "max"): + raise ValueError( + "The `checkpoint_score_mode` parameter can only be " + f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." + ) + if ( + self.checkpoint_score_metric is None + and self.checkpoint_score_mode is not None + ): + raise ValueError( + "`checkpoint_score_mode` cannot be set if " + "`checkpoint_score_metric` is not set." + ) + + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_metric is None: + return self.checkpoint_score_metric + prefix = "" + if self.checkpoint_score_mode == "min": + prefix = "min-" + return f"{prefix}{self.checkpoint_score_metric}" + + @property + def checkpoint_score_mode_not_none(self) -> str: + """``checkpoint_score_mode`` but None -> 'max'""" + return self.checkpoint_score_mode or "max" @dataclass diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 2b52fb844244..954c0b8f9054 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,9 +24,11 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. best_checkpoint: The best checkpoint of the Trainable, as - determined by the ``metric`` and ``mode`` arguments set. + determined by the ``checkpointing`` argument of ``RunConfig``, + or, if that's unset, by ``metric`` and ``mode`` arguments of + ``TuneConfig``. If either of those has not been set, this will be None. - May be the same as ``checkpoint``. + May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. dataframe: The full result dataframe of the Trainable. 
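The checkpoint_score_attr property added above translates the new metric/mode pair back into the legacy attribute string. A few illustrative values:

from ray.air.config import CheckpointingConfig

# No metric set: the property returns None.
assert CheckpointingConfig().checkpoint_score_attr is None

# Default mode (or "max") keeps the plain metric name.
assert CheckpointingConfig(checkpoint_score_metric="loss").checkpoint_score_attr == "loss"

# mode="min" adds the "min-" prefix expected by tune.run.
assert (
    CheckpointingConfig(
        checkpoint_score_metric="loss", checkpoint_score_mode="min"
    ).checkpoint_score_attr
    == "min-loss"
)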
""" diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index dce2ce930c8d..20138448a77d 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,6 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass +from ray.air.config import CheckpointingConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -38,6 +39,33 @@ def test_run_config(): DummyTrainer(run_config=ray.air.RunConfig()) +def test_checkpointing_config(): + # cannot set checkpoint_score_mode if checkpoint_score_metric is unset + with pytest.raises(ValueError): + CheckpointingConfig(checkpoint_score_mode="min") + + with pytest.raises(ValueError): + CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + ) + + checkpointing = CheckpointingConfig() + assert checkpointing.checkpoint_score_attr is None + + checkpointing = CheckpointingConfig(checkpoint_score_metric="metric") + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="max" + ) + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="min" + ) + assert checkpointing.checkpoint_score_attr == "min-metric" + + def test_scaling_config(): with pytest.raises(ValueError): DummyTrainer(scaling_config="invalid") diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index b2ea167369d7..7a0bf39eff6a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis) + return ResultGrid(analysis, self._run_config.checkpointing) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 2568ebe46b09..ef4dd58b5064 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import pandas as pd @@ -11,6 +11,9 @@ from ray.tune.trial import Trial from ray.util import PublicAPI +if TYPE_CHECKING: + from ray.air.config import CheckpointingConfig + @PublicAPI(stability="alpha") class ResultGrid: @@ -40,8 +43,14 @@ class ResultGrid: seen by Tune will be provided. 
""" - def __init__(self, experiment_analysis: ExperimentAnalysis): + def __init__( + self, + experiment_analysis: ExperimentAnalysis, + checkpointing_config: Optional["CheckpointingConfig"] = None, + ): self._experiment_analysis = experiment_analysis + # Used to determine best checkpoint + self._checkpointing_config = checkpointing_config def get_best_result( self, @@ -165,8 +174,21 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + + checkpoint_metric = ( + self._checkpointing_config.checkpoint_score_metric + if self._checkpointing_config + else None + ) + checkpoint_mode = ( + self._checkpointing_config.checkpoint_score_mode_not_none + if self._checkpointing_config and checkpoint_metric + else None + ) try: - best_checkpoint = self._experiment_analysis.best_checkpoint + best_checkpoint = self._experiment_analysis.get_best_checkpoint( + trial, metric=checkpoint_metric, mode=checkpoint_mode + ) except ValueError: best_checkpoint = None diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index dbcab0037d50..6a81cfd01d1d 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -50,7 +50,7 @@ def f(config): f.write(json.dumps({"step": i})) tune.report(step=i) - analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis = tune.run(f, config={"a": 1}, metric="step", mode="min") analysis._legacy_checkpoint = False result_grid = ResultGrid(analysis) result = result_grid[0] @@ -61,7 +61,7 @@ def f(config): assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index a7cad997092c..07a088efacf6 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import RunConfig +from ray.air.config import CheckpointingConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -32,6 +32,16 @@ class DummyTrainer(BaseTrainer): "placement_strategy", ] + def training_loop(self) -> None: + for i in range(5): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(str(i)) + tune.report(step=i) + + +class FailingTrainer(DummyTrainer): def training_loop(self) -> None: raise RuntimeError("There is an error in trainer!") @@ -189,7 +199,7 @@ def on_step_end(self, iteration, trials, **kwargs): assert len(results) == 4 def test_tuner_trainer_fail(self): - trainer = DummyTrainer() + trainer = FailingTrainer() param_space = { "scaling_config": { "num_workers": tune.grid_search([1, 2]), @@ -243,6 +253,68 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} + def test_tuner_checkpoint_configuration(self): + # Case 1: nothing set + trainer = DummyTrainer() + tuner = Tuner(trainer) + + results = tuner.fit() + result = results[0] + assert 
result.checkpoint + assert not result.best_checkpoint + + # Case 2: metric and mode set + trainer = DummyTrainer() + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + + # Case 3: CheckpointingConfig set. Takes priority. + trainer = DummyTrainer( + run_config=RunConfig( + checkpointing=CheckpointingConfig( + checkpoint_score_metric="step", checkpoint_score_mode="min" + ) + ) + ) + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + if __name__ == "__main__": import sys From 024932e8acaf3a1deeb2a3af7ccdb9965589bbd3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:17:52 +0000 Subject: [PATCH 24/70] Result grid tweaks --- python/ray/tune/result_grid.py | 43 +++++++++++++++--- python/ray/tune/tests/test_result_grid.py | 53 +++++++++++++++++++++++ 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ef4dd58b5064..78f05e6dcd20 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -58,6 +58,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, + checkpointing_config: Union[bool, "CheckpointingConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -79,6 +80,13 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( @@ -92,6 +100,10 @@ def get_best_result( "`get_best_result` or specify a mode in the " "`TuneConfig` of your `Tuner`." 
) + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -112,7 +124,19 @@ def get_best_result( ) raise RuntimeError(error_msg) - return self._trial_to_result(best_trial) + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return self._trial_to_result( + best_trial, checkpointing_config=checkpointing_config + ) def get_dataframe( self, @@ -159,7 +183,10 @@ def __len__(self) -> int: def __getitem__(self, i) -> Result: """Returns the i'th result in the grid.""" - return self._trial_to_result(self._experiment_analysis.trials[i]) + return self._trial_to_result( + self._experiment_analysis.trials[i], + checkpointing_config=self._checkpointing_config, + ) @staticmethod def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError]]: @@ -172,17 +199,19 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result(self, trial: Trial) -> Result: + def _trial_to_result( + self, trial: Trial, checkpointing_config: "CheckpointingConfig" + ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - self._checkpointing_config.checkpoint_score_metric - if self._checkpointing_config + checkpointing_config.checkpoint_score_metric + if checkpointing_config else None ) checkpoint_mode = ( - self._checkpointing_config.checkpoint_score_mode_not_none - if self._checkpointing_config and checkpoint_metric + checkpointing_config.checkpoint_score_mode_not_none + if checkpointing_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 6a81cfd01d1d..c6bcb7af2077 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -150,6 +150,59 @@ def f(config): assert best_result.metrics["x"] == 2 +def test_best_result_best_checkpoint(ray_start_2_cpus): + from ray.air.config import CheckpointingConfig + + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) + tune.report(x=config["x"] * (i + 1), step=i) + + def load_checkpoint(result): + with open( + os.path.join(result.best_checkpoint.to_directory(), "checkpoint") + ) as f: + checkpoint_data = json.load(f) + return checkpoint_data + + analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) + + # No checkpointing config. Use metric and mode + result_grid = ResultGrid(analysis) + best_result = result_grid.get_best_result(metric="x", mode="max") + assert best_result.metrics["x"] == 6 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + # Checkpointing config. 
Use by default + result_grid = ResultGrid( + analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + ) + best_result = result_grid.get_best_result(metric="x", mode="min") + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + best_result = result_grid.get_best_result( + metric="x", mode="min", checkpointing_config=False + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 0 + + best_result = result_grid.get_best_result( + metric="x", + mode="min", + checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + def test_best_result_no_report(ray_start_2_cpus): def f(config): pass From abf2cdc9a18d56147d1dfd2aec4b56eab0a1223b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:24:23 +0000 Subject: [PATCH 25/70] Extend --- python/ray/tune/result_grid.py | 62 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 78f05e6dcd20..896cd1b885c8 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -52,6 +52,27 @@ def __init__( # Used to determine best checkpoint self._checkpointing_config = checkpointing_config + def _resolve_checkpointing_config( + self, + checkpointing_config: "CheckpointingConfig", + metric: Optional[str] = None, + mode: Optional[str] = None, + ) -> "CheckpointingConfig": + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return checkpointing_config + def get_best_result( self, metric: Optional[str] = None, @@ -101,9 +122,6 @@ def get_best_result( "`TuneConfig` of your `Tuner`." 
) - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -124,16 +142,10 @@ def get_best_result( ) raise RuntimeError(error_msg) - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + checkpointing_config = self._resolve_checkpointing_config( + checkpointing_config, metric=metric, mode=mode + ) - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config - else: - checkpointing_config = CheckpointingConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) return self._trial_to_result( best_trial, checkpointing_config=checkpointing_config ) @@ -181,11 +193,33 @@ def get_dataframe( def __len__(self) -> int: return len(self._experiment_analysis.trials) - def __getitem__(self, i) -> Result: + def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" + return self.get( + self._experiment_analysis.trials[i], + ) + + def get( + self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True + ): + """Returns the i'th result in the grid. + + Args: + i: index to return. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. + """ + + checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=self._checkpointing_config, + checkpointing_config=checkpointing_config, ) @staticmethod From 563bc338b976e0a4a98d51982cddada388ceb7a6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 05:38:46 +0200 Subject: [PATCH 26/70] Update result_grid.py --- python/ray/tune/result_grid.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 896cd1b885c8..9321de0d8cdb 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -195,9 +195,7 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get( - self._experiment_analysis.trials[i], - ) + return self.get(i) def get( self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True From d0261bea6d4f40414491d84fe9017cc7ad335c45 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:23 +0000 Subject: [PATCH 27/70] Fix --- python/ray/tune/result_grid.py | 2 +- python/ray/tune/tests/test_result_grid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9321de0d8cdb..344c4356938f 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -232,7 +232,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: "CheckpointingConfig" + self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] ) -> Result: checkpoint = 
trial.checkpoint.to_air_checkpoint() diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index c6bcb7af2077..bccd553469a3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial) + result = result_grid._trial_to_result(trial, checkpointing_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) From 56df4936e05391ae27255d762b1f32f4f2105138 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:47 +0000 Subject: [PATCH 28/70] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 344c4356938f..4513935697a0 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -107,7 +107,7 @@ def get_best_result( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( From ef0c75ae685afdc31b70a66f43701b265f59decc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 17:57:36 +0000 Subject: [PATCH 29/70] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 4513935697a0..6f5edc3d3f13 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -210,7 +210,7 @@ def get( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. 
""" checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) From 3464c93c5eb8fdd7ea0fb3a5c6cf4a071371246d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 18:43:36 +0000 Subject: [PATCH 30/70] WIP --- python/ray/train/tests/test_tune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 3fac9a1e6599..dafe52241312 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -125,9 +126,8 @@ def train_func(config): trainer, ) - # with pytest.raises(TuneError): - tuner.fit() - print("a") + with pytest.raises(TuneError): + tuner.fit() def test_tune_checkpoint(ray_start_4_cpus): From ee87c12d772860c18f80d4a2bb4a5c2514a81195 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 19:12:46 +0000 Subject: [PATCH 31/70] Renaming --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 6 +- python/ray/air/result.py | 12 ++-- python/ray/air/tests/test_api.py | 14 ++--- python/ray/tune/impl/tuner_internal.py | 10 ++-- python/ray/tune/result_grid.py | 68 +++++++++++------------ python/ray/tune/tests/test_result_grid.py | 10 ++-- python/ray/tune/tests/test_tuner.py | 6 +- 8 files changed, 63 insertions(+), 67 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 2c82cce8f4e3..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,7 +4,7 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointingConfig, + CheckpointConfig, ) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointingConfig", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index ab9cfe79b67e..5b0a886cc3e2 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -275,7 +275,7 @@ class FailureConfig: @dataclass @PublicAPI(stability="alpha") -class CheckpointingConfig: +class CheckpointConfig: """Configuration related to checkpointing of each run/trial. Args: @@ -352,7 +352,7 @@ class RunConfig: and thus will not take effect in resumed runs). failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. - checkpointing: Checkpointing configuration. + checkpoint_config: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -365,5 +365,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpointing: Optional[CheckpointingConfig] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 954c0b8f9054..f615959a2fdc 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -23,11 +23,13 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. 
checkpoint: The final checkpoint of the Trainable. - best_checkpoint: The best checkpoint of the Trainable, as - determined by the ``checkpointing`` argument of ``RunConfig``, - or, if that's unset, by ``metric`` and ``mode`` arguments of - ``TuneConfig``. - If either of those has not been set, this will be None. + best_checkpoint: The best checkpoint of the Trainable. + This will be determined by (from highest priority): + + 1. ``checkpoint_config`` argument of ``run_config`` + 2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``) + + If neither of those has not been set, this will be None. May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. dataframe: The full result dataframe of the Trainable. diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 20138448a77d..136cb0e58473 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointingConfig +from ray.air.config import CheckpointConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -42,25 +42,25 @@ def test_run_config(): def test_checkpointing_config(): # cannot set checkpoint_score_mode if checkpoint_score_metric is unset with pytest.raises(ValueError): - CheckpointingConfig(checkpoint_score_mode="min") + CheckpointConfig(checkpoint_score_mode="min") with pytest.raises(ValueError): - CheckpointingConfig( + CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="invalid" ) - checkpointing = CheckpointingConfig() + checkpointing = CheckpointConfig() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointingConfig(checkpoint_score_metric="metric") + checkpointing = CheckpointConfig(checkpoint_score_metric="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointingConfig( + checkpointing = CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointingConfig( + checkpointing = CheckpointConfig( checkpoint_score_metric="metric", checkpoint_score_mode="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 7a0bf39eff6a..e2dfadf43fdf 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis, self._run_config.checkpointing) + return ResultGrid(analysis, self._run_config.checkpoint_config) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" @@ -161,13 +161,13 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: self._run_config.failure.max_failures if self._run_config.failure else 0 ), keep_checkpoints_num=( - self._run_config.checkpointing.keep_checkpoints_num - if self._run_config.checkpointing + self._run_config.checkpoint_config.keep_checkpoints_num + if self._run_config.checkpoint_config else None ), checkpoint_score_attr=( - self._run_config.checkpointing.checkpoint_score_attr - if self._run_config.checkpointing + 
self._run_config.checkpoint_config.checkpoint_score_attr + if self._run_config.checkpoint_config else None ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 6f5edc3d3f13..afa68b609b59 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -12,7 +12,7 @@ from ray.util import PublicAPI if TYPE_CHECKING: - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig @PublicAPI(stability="alpha") @@ -46,32 +46,32 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpointing_config: Optional["CheckpointingConfig"] = None, + checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis # Used to determine best checkpoint - self._checkpointing_config = checkpointing_config + self._checkpointing_config = checkpoint_config - def _resolve_checkpointing_config( + def _resolve_checkpoint_config( self, - checkpointing_config: "CheckpointingConfig", + checkpoint_config: "CheckpointConfig", metric: Optional[str] = None, mode: Optional[str] = None, - ) -> "CheckpointingConfig": + ) -> "CheckpointConfig": # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig metric = metric or self._experiment_analysis.default_metric mode = mode or self._experiment_analysis.default_mode - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config + if not isinstance(checkpoint_config, CheckpointConfig): + if checkpoint_config and self._checkpointing_config: + checkpoint_config = self._checkpointing_config else: - checkpointing_config = CheckpointingConfig( + checkpoint_config = CheckpointConfig( checkpoint_score_metric=metric, checkpoint_score_mode=mode ) - return checkpointing_config + return checkpoint_config def get_best_result( self, @@ -79,7 +79,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpointing_config: Union[bool, "CheckpointingConfig"] = True, + checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -101,12 +101,12 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``run_config`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: @@ -142,13 +142,11 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpointing_config = self._resolve_checkpointing_config( - checkpointing_config, metric=metric, mode=mode + checkpoint_config = self._resolve_checkpoint_config( + checkpoint_config, metric=metric, mode=mode ) - return self._trial_to_result( - best_trial, checkpointing_config=checkpointing_config - ) + return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) def get_dataframe( self, @@ -197,27 +195,25 @@ def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" return self.get(i) - def get( - self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True - ): + def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): """Returns the i'th result in the grid. Args: i: index to return. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``RunConfig`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. """ - checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=checkpointing_config, + checkpoint_config=checkpoint_config, ) @staticmethod @@ -232,18 +228,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] + self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - checkpointing_config.checkpoint_score_metric - if checkpointing_config - else None + checkpoint_config.checkpoint_score_metric if checkpoint_config else None ) checkpoint_mode = ( - checkpointing_config.checkpoint_score_mode_not_none - if checkpointing_config and checkpoint_metric + checkpoint_config.checkpoint_score_mode_not_none + if checkpoint_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index bccd553469a3..96789116db33 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpointing_config=None) + result = result_grid._trial_to_result(trial, checkpoint_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) @@ -151,7 +151,7 @@ def f(config): def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig def f(config): for i in range(2): @@ -179,7 +179,7 @@ def 
load_checkpoint(result): # Checkpointing config. Use by default result_grid = ResultGrid( - analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") ) best_result = result_grid.get_best_result(metric="x", mode="min") assert best_result.metrics["x"] == 2 @@ -187,7 +187,7 @@ def load_checkpoint(result): assert load_checkpoint(best_result)["step"] == 1 best_result = result_grid.get_best_result( - metric="x", mode="min", checkpointing_config=False + metric="x", mode="min", checkpoint_config=False ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint @@ -196,7 +196,7 @@ def load_checkpoint(result): best_result = result_grid.get_best_result( metric="x", mode="min", - checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index 07a088efacf6..dab5b9a1b51b 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointingConfig, RunConfig +from ray.air.config import CheckpointConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -286,10 +286,10 @@ def test_tuner_checkpoint_configuration(self): != results[1].best_checkpoint.get_internal_representation() ) - # Case 3: CheckpointingConfig set. Takes priority. + # Case 3: CheckpointConfig set. Takes priority. trainer = DummyTrainer( run_config=RunConfig( - checkpointing=CheckpointingConfig( + checkpoint_config=CheckpointConfig( checkpoint_score_metric="step", checkpoint_score_mode="min" ) ) From b10fe1e18745cd045168f256229b64c3b841fa6b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:05:25 +0000 Subject: [PATCH 32/70] Improve test coverage --- python/ray/train/tests/test_examples.py | 12 ++++++++++++ python/ray/train/tests/test_tune.py | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index fd6a2fadbf91..2ebef818d7aa 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,6 +51,10 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" @@ -103,6 +107,10 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. 
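The loss assertions added above rely on Result.dataframe, which holds one row per reported iteration with the reported metrics as columns. The same pattern outside a test, assuming `trainer` is any Trainer whose training loop reports a "loss" key each epoch:

result = trainer.fit()

loss = list(result.dataframe["loss"])
print(f"first epoch: {loss[0]:.4f}, last epoch: {loss[-1]:.4f}")
assert loss[-1] < loss[0], "training did not reduce the loss"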
def test_torch_linear_failure(ray_start_4_cpus): @@ -138,6 +146,10 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index dafe52241312..0196a84e46b6 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,6 @@ import ray import ray.train as train from ray import tune -from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -126,8 +125,9 @@ def train_func(config): trainer, ) - with pytest.raises(TuneError): - tuner.fit() + result_grid = tuner.fit() + with pytest.raises(RuntimeError): + raise result_grid[0].error def test_tune_checkpoint(ray_start_4_cpus): From 4dbcccaba67b23538d969e5df309c506e128d964 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:51:57 +0000 Subject: [PATCH 33/70] Simplify --- python/ray/air/result.py | 18 ++-- .../ray/tune/analysis/experiment_analysis.py | 2 +- python/ray/tune/function_runner.py | 2 + python/ray/tune/result_grid.py | 84 +++---------------- python/ray/tune/tests/test_result_grid.py | 65 +++++--------- python/ray/tune/tests/test_tuner.py | 64 +------------- python/ray/tune/trial.py | 3 + 7 files changed, 46 insertions(+), 192 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index f615959a2fdc..77b3d4b03d28 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Tuple from dataclasses import dataclass from ray.air.checkpoint import Checkpoint @@ -23,23 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. - best_checkpoint: The best checkpoint of the Trainable. - This will be determined by (from highest priority): - - 1. ``checkpoint_config`` argument of ``run_config`` - 2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``) - - If neither of those has not been set, this will be None. - May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + dataframe: The full result dataframe of the Trainable. Each row of the + dataframe corresponds to one iteration and contains reported + metrics. + checkpoint_history: A list of tuples of all checkpoints saved + by the Trainable and their associated metrics. 
""" metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] - best_checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] + checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 84d99637f4c7..f537788f61f9 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -436,7 +436,7 @@ def get_trial_checkpoints_paths( ) return path_metric_df[["chkpt_path", metric]].values.tolist() elif isinstance(trial, Trial): - checkpoints = trial.checkpoint_manager.best_checkpoints() + checkpoints = trial.get_trial_checkpoints() # Support metrics given as paths, e.g. # "info/learner/default_policy/policy_loss". return [ diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index 89930e921351..02f5cf707989 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -441,6 +441,8 @@ def step(self): new_result = self._last_result.copy() new_result.update(result) result = new_result + # Do not checkpoint again + result[SHOULD_CHECKPOINT] = False self._last_result = result if self._status_reporter.has_new_checkpoint(): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index afa68b609b59..12f4cf0c8514 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import pandas as pd @@ -11,9 +11,6 @@ from ray.tune.trial import Trial from ray.util import PublicAPI -if TYPE_CHECKING: - from ray.air.config import CheckpointConfig - @PublicAPI(stability="alpha") class ResultGrid: @@ -46,32 +43,8 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis - # Used to determine best checkpoint - self._checkpointing_config = checkpoint_config - - def _resolve_checkpoint_config( - self, - checkpoint_config: "CheckpointConfig", - metric: Optional[str] = None, - mode: Optional[str] = None, - ) -> "CheckpointConfig": - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointConfig - - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - - if not isinstance(checkpoint_config, CheckpointConfig): - if checkpoint_config and self._checkpointing_config: - checkpoint_config = self._checkpointing_config - else: - checkpoint_config = CheckpointConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) - return checkpoint_config def get_best_result( self, @@ -79,7 +52,6 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. 
@@ -142,11 +114,7 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpoint_config = self._resolve_checkpoint_config( - checkpoint_config, metric=metric, mode=mode - ) - - return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) + return self._trial_to_result(best_trial) def get_dataframe( self, @@ -193,27 +161,8 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get(i) - - def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): - """Returns the i'th result in the grid. - - Args: - i: index to return. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``RunConfig`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. - """ - - checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) - return self._trial_to_result( self._experiment_analysis.trials[i], - checkpoint_config=checkpoint_config, ) @staticmethod @@ -227,31 +176,20 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result( - self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] - ) -> Result: + def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - - checkpoint_metric = ( - checkpoint_config.checkpoint_score_metric if checkpoint_config else None - ) - checkpoint_mode = ( - checkpoint_config.checkpoint_score_mode_not_none - if checkpoint_config and checkpoint_metric - else None - ) - try: - best_checkpoint = self._experiment_analysis.get_best_checkpoint( - trial, metric=checkpoint_metric, mode=checkpoint_mode - ) - except ValueError: - best_checkpoint = None + checkpoint_history = [ + (checkpoint.to_air_checkpoint(), checkpoint.metrics) + for checkpoint in trial.get_trial_checkpoints() + ] result = Result( checkpoint=checkpoint, - best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + if self._experiment_analysis + else None, + checkpoint_history=checkpoint_history, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 96789116db33..0de68be19190 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,13 +55,17 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.checkpoint_history, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath( + min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + 0 + ].get_internal_representation()[1] + ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 @@ 
-81,7 +85,6 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert result.best_checkpoint is None assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) @@ -124,10 +127,11 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpoint_config=None) + result = result_grid._trial_to_result(trial) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) + assert result.dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config @@ -150,57 +154,30 @@ def f(config): assert best_result.metrics["x"] == 2 -def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointConfig - +def test_best_result_checkpoint_history(ray_start_2_cpus): def f(config): for i in range(2): with tune.checkpoint_dir(step=i) as checkpoint_dir: path = os.path.join(checkpoint_dir, "checkpoint") with open(path, "w") as f: - f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) - tune.report(x=config["x"] * (i + 1), step=i) - - def load_checkpoint(result): - with open( - os.path.join(result.best_checkpoint.to_directory(), "checkpoint") - ) as f: - checkpoint_data = json.load(f) - return checkpoint_data + f.write(json.dumps(dict(x=config["x"], step=i))) + tune.report(x=config["x"], step=i) analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) # No checkpointing config. Use metric and mode result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") - assert best_result.metrics["x"] == 6 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - # Checkpointing config. 
Use by default - result_grid = ResultGrid( - analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") - ) - best_result = result_grid.get_best_result(metric="x", mode="min") - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - best_result = result_grid.get_best_result( - metric="x", mode="min", checkpoint_config=False - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 0 - - best_result = result_grid.get_best_result( - metric="x", - mode="min", - checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 + assert best_result.metrics["x"] == 3 + print(best_result.checkpoint_history) + print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) + assert len(best_result.checkpoint_history) == 2 + i = 0 + for checkpoint, metrics in best_result.checkpoint_history: + assert isinstance(checkpoint, Checkpoint) + assert metrics["x"] == 3 + assert metrics["step"] == i + i += 1 def test_best_result_no_report(ray_start_2_cpus): diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index dab5b9a1b51b..e00e41fcdefd 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointConfig, RunConfig +from ray.air.config import RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -253,68 +253,6 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} - def test_tuner_checkpoint_configuration(self): - # Case 1: nothing set - trainer = DummyTrainer() - tuner = Tuner(trainer) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert not result.best_checkpoint - - # Case 2: metric and mode set - trainer = DummyTrainer() - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - - # Case 3: CheckpointConfig set. Takes priority. 
- trainer = DummyTrainer( - run_config=RunConfig( - checkpoint_config=CheckpointConfig( - checkpoint_score_metric="step", checkpoint_score_mode="min" - ) - ) - ) - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - if __name__ == "__main__": import sys diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 73e3944ee289..ea5d4bceb809 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -767,6 +767,9 @@ def get_trainable_cls(self): def is_finished(self): return self.status in [Trial.ERROR, Trial.TERMINATED] + def get_trial_checkpoints(self) -> List[_TrackedCheckpoint]: + return self.checkpoint_manager.best_checkpoints() + @property def is_restoring(self): return self.restoring_from is not None From 27e531c3431c23225e3caa47f92e3d93b871cc0d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:54:43 +0000 Subject: [PATCH 34/70] Docstring tweak --- python/ray/tune/analysis/experiment_analysis.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index f537788f61f9..7b0518cfa5c5 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -366,7 +366,11 @@ def results_df(self) -> DataFrame: @property def trial_dataframes(self) -> Dict[str, DataFrame]: - """List of all dataframes of the trials.""" + """List of all dataframes of the trials. + + Each row of the dataframe corresponds to one iteration of a trial + and contains reported metrics. + """ return self._trial_dataframes def dataframe( From 7d1abfe2a2d6b1786e3b571a5cdc8fbcca256cdf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:56:16 +0000 Subject: [PATCH 35/70] Remove docstring --- python/ray/tune/result_grid.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 12f4cf0c8514..9c216b657e7b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -73,13 +73,6 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``run_config`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: raise ValueError( From b0dd3baf038252f2f4208b18a10e380d8b566a40 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:58:45 +0000 Subject: [PATCH 36/70] Fix --- python/ray/air/result.py | 9 ++++++--- python/ray/tune/result_grid.py | 4 ++-- python/ray/tune/tests/test_result_grid.py | 12 ++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 77b3d4b03d28..a404569b3c9b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -27,15 +27,18 @@ class Result: dataframe: The full result dataframe of the Trainable. Each row of the dataframe corresponds to one iteration and contains reported metrics. - checkpoint_history: A list of tuples of all checkpoints saved - by the Trainable and their associated metrics. + best_checkpoints: A list of tuples of the best checkpoints saved + by the Trainable and their associated metrics. The number of + saved checkpoints is determined by the ``checkpoint_config`` + argument of ``run_config`` (by default, all checkpoints will + be saved). """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] - checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] + best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9c216b657e7b..aaf6a73ecf2b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -171,7 +171,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - checkpoint_history = [ + best_checkpoints = [ (checkpoint.to_air_checkpoint(), checkpoint.metrics) for checkpoint in trial.get_trial_checkpoints() ] @@ -183,6 +183,6 @@ def _trial_to_result(self, trial: Trial) -> Result: dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) if self._experiment_analysis else None, - checkpoint_history=checkpoint_history, + best_checkpoints=best_checkpoints, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 0de68be19190..dc49c404cdc6 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,14 +55,14 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.checkpoint_history, list) + assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( - min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + min((x for x in result.best_checkpoints), key=lambda x: x[1]["step"])[ 0 ].get_internal_representation()[1] ) @@ -169,11 +169,11 @@ def f(config): result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") assert best_result.metrics["x"] == 3 - print(best_result.checkpoint_history) - print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) - assert len(best_result.checkpoint_history) 
== 2 + print(best_result.best_checkpoints) + print([x[0].get_internal_representation() for x in best_result.best_checkpoints]) + assert len(best_result.best_checkpoints) == 2 i = 0 - for checkpoint, metrics in best_result.checkpoint_history: + for checkpoint, metrics in best_result.best_checkpoints: assert isinstance(checkpoint, Checkpoint) assert metrics["x"] == 3 assert metrics["step"] == i From 5b226abab1b98e5bd237a87270bf2cd31f05b8bf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 21:10:01 +0000 Subject: [PATCH 37/70] Tweak docstring --- python/ray/air/result.py | 4 ++-- python/ray/tune/analysis/experiment_analysis.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index a404569b3c9b..d6fa35a4a809 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,8 +24,8 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. Each row of the - dataframe corresponds to one iteration and contains reported + dataframe: The full result dataframe of the Trainable. + The dataframe is indexed by iterations and contains reported metrics. best_checkpoints: A list of tuples of the best checkpoints saved by the Trainable and their associated metrics. The number of diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 7b0518cfa5c5..97dd0a924a51 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -368,8 +368,8 @@ def results_df(self) -> DataFrame: def trial_dataframes(self) -> Dict[str, DataFrame]: """List of all dataframes of the trials. - Each row of the dataframe corresponds to one iteration of a trial - and contains reported metrics. + Each dataframe is indexed by iterations and contains reported + metrics. 
""" return self._trial_dataframes From 65ce1d3c9a1fd668a3c09eece4c07d46476c15b9 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 22:21:39 +0000 Subject: [PATCH 38/70] Fix --- python/ray/tune/impl/tuner_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index e2dfadf43fdf..3faf25ee895c 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis, self._run_config.checkpoint_config) + return ResultGrid(analysis) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" From 1e1fbea7d3392aa70869750a09c3e1467678dc06 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 11:22:08 +0000 Subject: [PATCH 39/70] Use CheckpointStrategy --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 55 +------------------ python/ray/air/tests/test_api.py | 14 ++--- .../ray/util/ml_utils/checkpoint_manager.py | 10 ++++ 4 files changed, 21 insertions(+), 62 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 506f9d022cc0..922f1bc83b94 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointConfig, ) +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointConfig", + "CheckpointStrategy", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b0a886cc3e2..5ea53d92f749 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,6 +5,7 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TYPE_CHECKING: from ray.data import Dataset @@ -273,58 +274,6 @@ class FailureConfig: max_failures: int = 0 -@dataclass -@PublicAPI(stability="alpha") -class CheckpointConfig: - """Configuration related to checkpointing of each run/trial. - - Args: - keep_checkpoints_num: Number of checkpoints to keep. A value of - `None` keeps all checkpoints. Defaults to `None`. If set, need - to provide `checkpoint_score_attr`. - checkpoint_score_metric: Specifies by which metric to rank the - best checkpoint. Defaults to training iteration. - checkpoint_score_mode: Must be one of [min, max]. Determines - whether ``checkpoint_score_metric`` should be minimized or maximized. - If not set, will be the same as 'max'. Cannot be set if - ``checkpoint_score_metric`` is not set. - """ - - keep_checkpoints_num: Optional[int] = None - checkpoint_score_metric: Optional[str] = None - checkpoint_score_mode: Optional[str] = None - - def __post_init__(self): - if self.checkpoint_score_mode not in (None, "min", "max"): - raise ValueError( - "The `checkpoint_score_mode` parameter can only be " - f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." 
- ) - if ( - self.checkpoint_score_metric is None - and self.checkpoint_score_mode is not None - ): - raise ValueError( - "`checkpoint_score_mode` cannot be set if " - "`checkpoint_score_metric` is not set." - ) - - @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" - if self.checkpoint_score_metric is None: - return self.checkpoint_score_metric - prefix = "" - if self.checkpoint_score_mode == "min": - prefix = "min-" - return f"{prefix}{self.checkpoint_score_metric}" - - @property - def checkpoint_score_mode_not_none(self) -> str: - """``checkpoint_score_mode`` but None -> 'max'""" - return self.checkpoint_score_mode or "max" - - @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -365,5 +314,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointConfig] = None + checkpoint_config: Optional[CheckpointStrategy] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 136cb0e58473..d7054c5a81bf 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointConfig +from ray.air.config import CheckpointStrategy from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -42,25 +42,25 @@ def test_run_config(): def test_checkpointing_config(): # cannot set checkpoint_score_mode if checkpoint_score_metric is unset with pytest.raises(ValueError): - CheckpointConfig(checkpoint_score_mode="min") + CheckpointStrategy(checkpoint_score_mode="min") with pytest.raises(ValueError): - CheckpointConfig( + CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="invalid" ) - checkpointing = CheckpointConfig() + checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointConfig(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9a27acd10e36..4b4dc9b8d113 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -230,6 +230,16 @@ def __post_init__(self): f"checkpoint_score_order must be either " f'"{MAX}" or "{MIN}".' 
) + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_attribute is None: + return self.checkpoint_score_attribute + prefix = "" + if self.checkpoint_score_order == MIN: + prefix = "min-" + return f"{prefix}{self.checkpoint_score_attribute}" + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune. From e19d40f542dd8e3af89042331ccc1d94d48692cf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 15:15:17 +0200 Subject: [PATCH 40/70] Fix --- python/ray/air/tests/test_api.py | 12 ++++-------- python/ray/tune/impl/tuner_internal.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index d7054c5a81bf..1c0680860e3f 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -40,28 +40,24 @@ def test_run_config(): def test_checkpointing_config(): - # cannot set checkpoint_score_mode if checkpoint_score_metric is unset - with pytest.raises(ValueError): - CheckpointStrategy(checkpoint_score_mode="min") - with pytest.raises(ValueError): CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="max" + checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="min" + checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 60b89b8acb98..2d348c94d47a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -163,7 +163,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else 0 ), keep_checkpoints_num=( - self._run_config.checkpoint_config.keep_checkpoints_num + self._run_config.checkpoint_config.num_to_keep if self._run_config.checkpoint_config else None ), From fd961746ca582263ecbc6bacc4342e915bd74416 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:43:53 +0000 Subject: [PATCH 41/70] dataframe -> metrics_dataframe --- python/ray/air/result.py | 4 ++-- python/ray/tune/result_grid.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index d6fa35a4a809..5b7f0fcba04b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,7 +24,7 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + metrics_dataframe: The full result dataframe of the Trainable. The dataframe is indexed by iterations and contains reported metrics. 
best_checkpoints: A list of tuples of the best checkpoints saved @@ -37,7 +37,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - dataframe: Optional[pd.DataFrame] + metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ed6711414985..b0cc6f83899e 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -180,7 +180,9 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + metrics_dataframe=self._experiment_analysis.trial_dataframes.get( + trial.logdir + ) if self._experiment_analysis else None, best_checkpoints=best_checkpoints, From 8d5f1b3d63d6b58843f2fd51d39c8502b2293015 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:04:09 +0000 Subject: [PATCH 42/70] CheckpointStrategy -> CheckpointConfig --- doc/source/train/api.rst | 4 --- doc/source/train/user_guide.rst | 10 +++--- python/ray/air/__init__.py | 4 +-- python/ray/air/config.py | 6 ++-- python/ray/air/tests/test_api.py | 12 +++---- python/ray/train/__init__.py | 6 +++- python/ray/train/_internal/checkpoint.py | 8 ++--- python/ray/train/data_parallel_trainer.py | 6 ++-- python/ray/train/tests/test_trainer.py | 12 +++---- python/ray/train/trainer.py | 16 ++++----- python/ray/tune/callback.py | 4 +-- .../ray/tune/execution/checkpoint_manager.py | 4 +-- .../ray/util/ml_utils/checkpoint_manager.py | 33 ++++++++++++++----- .../ml_utils/tests/test_checkpoint_manager.py | 14 ++++---- 14 files changed, 79 insertions(+), 60 deletions(-) diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index 054ad0f30d10..ea8b879cdcd9 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -117,10 +117,6 @@ Checkpointing .. _train-api-checkpoint-strategy: -CheckpointStrategy -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ray.train.CheckpointStrategy .. _train-api-func-utils: diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 7dc1bd79ce3a..ff2b2556afb0 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -700,13 +700,13 @@ As an example, to completely disable writing checkpoints to disk: :emphasize-lines: 8,12 from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): for epoch in range(3): train.save_checkpoint(epoch=epoch) - checkpoint_strategy = CheckpointStrategy(num_to_keep=0) + checkpoint_strategy = CheckpointConfig(num_to_keep=0) trainer = Trainer(backend="torch", num_workers=2) trainer.start() @@ -714,12 +714,12 @@ As an example, to completely disable writing checkpoints to disk: trainer.shutdown() -You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: +You may also config ``CheckpointConfig`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: .. 
code-block:: python from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): @@ -733,7 +733,7 @@ You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints pers train.save_checkpoint(loss=3) # Keep the 2 checkpoints with the smallest "loss" value. - checkpoint_strategy = CheckpointStrategy(num_to_keep=2, + checkpoint_strategy = CheckpointConfig(num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min") diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 922f1bc83b94..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, + CheckpointConfig, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointStrategy", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 00db7cdcf33b..5b7317d283ca 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,7 +5,9 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy + +# Move here later when ml_utils is deprecated +from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: from ray.data import Dataset @@ -314,5 +316,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure_config: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointStrategy] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 1c0680860e3f..e4474c24626a 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointStrategy +from ray.air.config import CheckpointConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -41,22 +41,22 @@ def test_run_config(): def test_checkpointing_config(): with pytest.raises(ValueError): - CheckpointStrategy( + CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) - checkpointing = CheckpointStrategy() + checkpointing = CheckpointConfig() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") + checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff 
--git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 11407fa8a16a..74d360f117b7 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -12,13 +12,16 @@ world_size, ) from ray.train.trainer import Trainer, TrainingIterator +from ray.air.config import CheckpointConfig + +# deprecated from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") __all__ = [ "BackendConfig", - "CheckpointStrategy", + "CheckpointConfig", "get_dataset_shard", "load_checkpoint", "local_rank", @@ -30,4 +33,5 @@ "world_rank", "world_size", "TRAIN_DATASET_KEY", + "CheckpointStrategy", ] diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 0a85f4396e36..8bffe957833d 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -11,7 +11,7 @@ TUNE_CHECKPOINT_ID, TUNE_INSTALLED, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointStrategy +from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointConfig from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager as CommonCheckpointManager, ) @@ -67,7 +67,7 @@ class CheckpointManager(CommonCheckpointManager): def __init__( self, run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.run_dir = run_dir @@ -136,11 +136,11 @@ def _get_next_checkpoint_path(self) -> Optional[Path]: def on_start_training( self, - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Path, latest_checkpoint_id: Optional[int] = 0, ): - checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + checkpoint_strategy = checkpoint_strategy or CheckpointConfig() self._checkpoint_strategy = checkpoint_strategy self._validate_checkpoint_strategy() diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 59ed5cb0be4c..c8729282ab40 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -8,7 +8,7 @@ from ray import tune from ray.air import session from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import DatasetConfig, RunConfig, ScalingConfig, CheckpointConfig from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY from ray.train import BackendConfig, TrainingIterator from ray.train._internal.backend_executor import BackendExecutor, TrialInfo @@ -18,7 +18,7 @@ from ray.train.constants import TRAIN_DATASET_KEY, WILDCARD_KEY from ray.train.trainer import BaseTrainer, GenDataset from ray.util.annotations import DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy, _TrackedCheckpoint +from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.data.preprocessor import Preprocessor @@ -32,7 +32,7 @@ def __init__( self, preprocessor: "Preprocessor", run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.preprocessor = preprocessor super(_DataParallelCheckpointManager, self).__init__( diff --git a/python/ray/train/tests/test_trainer.py b/python/ray/train/tests/test_trainer.py index 207d45f79d28..d366999e8611 100644 --- a/python/ray/train/tests/test_trainer.py +++ 
b/python/ray/train/tests/test_trainer.py @@ -10,7 +10,7 @@ import ray import ray.train as train from ray._private.test_utils import wait_for_condition -from ray.train import Trainer, CheckpointStrategy +from ray.train import Trainer, CheckpointConfig from ray.train.backend import BackendConfig, Backend from ray.train.constants import TRAIN_ENABLE_WORKER_SPREAD_ENV from ray.train.torch import TorchConfig @@ -514,7 +514,7 @@ def test_persisted_checkpoint_strategy(ray_start_2_cpus): logdir = "/tmp/test/trainer/test_persisted_checkpoint_strategy" config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -555,7 +555,7 @@ def validate(): def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir): config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -585,12 +585,12 @@ def train_func(): trainer.start() with pytest.raises(ValueError): - trainer.run(train_func, checkpoint_strategy=CheckpointStrategy(num_to_keep=-1)) + trainer.run(train_func, checkpoint_strategy=CheckpointConfig(num_to_keep=-1)) with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_order="invalid_order" ), ) @@ -598,7 +598,7 @@ def train_func(): with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_attribute="missing_attribute" ), ) diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py index 1980f16235df..0262f92963d8 100644 --- a/python/ray/train/trainer.py +++ b/python/ray/train/trainer.py @@ -9,6 +9,7 @@ import ray from ray.actor import ActorHandle from ray.air.checkpoint import Checkpoint +from ray.air.config import CheckpointConfig from ray.train._internal.backend_executor import ( BackendExecutor, InactiveWorkerGroupError, @@ -42,7 +43,6 @@ TUNE_INSTALLED, ) from ray.util.annotations import Deprecated, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TUNE_INSTALLED: from ray import tune @@ -293,7 +293,7 @@ def run( callbacks: Optional[List[TrainingCallback]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> List[T]: """Runs a training function in a distributed manner. @@ -321,7 +321,7 @@ def run( or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -373,7 +373,7 @@ def run_iterator( config: Optional[Dict[str, Any]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> "TrainingIterator": """Same as ``run`` except returns an iterator over the results. 
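
# Sketch of the renamed configuration with the legacy Trainer API, mirroring
# the updated user guide; `train_func` is assumed to be a training function
# that calls train.save_checkpoint():
from ray.train import CheckpointConfig, Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(
    train_func,
    checkpoint_strategy=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="loss",
        checkpoint_score_order="min",
    ),
)
print(trainer.best_checkpoint_path)  # "best" is defined by the CheckpointConfig
trainer.shutdown()
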
@@ -411,7 +411,7 @@ def train_func(config): ``str`` or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -462,7 +462,7 @@ def latest_checkpoint_dir(self) -> Optional[Path]: def best_checkpoint_path(self) -> Optional[Path]: """Path to the best persisted checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -486,7 +486,7 @@ def latest_checkpoint(self) -> Optional[Dict]: def best_checkpoint(self) -> Optional[Dict]: """Best saved checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -670,7 +670,7 @@ def __init__( dataset_spec: RayDatasetSpec, checkpoint_manager: CheckpointManager, checkpoint: Optional[Union[Dict, str, Path, Checkpoint]], - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Optional[Path] = None, ): self._backend_executor = backend_executor diff --git a/python/ray/tune/callback.py b/python/ray/tune/callback.py index fcf4e24aee3d..450ee55310f7 100644 --- a/python/ray/tune/callback.py +++ b/python/ray/tune/callback.py @@ -3,11 +3,11 @@ import warnings from ray.util.annotations import PublicAPI, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.tune.experiment import Trial from ray.tune.stopper import Stopper + from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint class _CallbackMeta(ABCMeta): @@ -251,7 +251,7 @@ def on_checkpoint( iteration: int, trials: List["Trial"], trial: "Trial", - checkpoint: _TrackedCheckpoint, + checkpoint: "_TrackedCheckpoint", **info, ): """Called after a trial saved a checkpoint with Tune. 
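
# Sketch of a user callback hooking the signature above. `_TrackedCheckpoint`
# is a developer API; `dir_or_data` holds the checkpoint path or data, as used
# elsewhere in this series. Pass an instance via tune.run(..., callbacks=[...]).
from ray import tune

class CheckpointLogger(tune.Callback):
    def on_checkpoint(self, iteration, trials, trial, checkpoint, **info):
        # `checkpoint` is the _TrackedCheckpoint that Tune just saved for `trial`.
        print(f"Iteration {iteration}: {trial} saved {checkpoint.dir_or_data}")
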
diff --git a/python/ray/tune/execution/checkpoint_manager.py b/python/ray/tune/execution/checkpoint_manager.py index 64b68a7fb416..f1295ac6f604 100644 --- a/python/ray/tune/execution/checkpoint_manager.py +++ b/python/ray/tune/execution/checkpoint_manager.py @@ -4,7 +4,7 @@ from ray.tune.result import TRAINING_ITERATION from ray.util.ml_utils.checkpoint_manager import ( - CheckpointStrategy, + CheckpointConfig, MIN, MAX, _CheckpointManager as CommonCheckpointManager, @@ -51,7 +51,7 @@ def __init__( else: checkpoint_score_attr = checkpoint_score_attr - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=keep_checkpoints_num, checkpoint_score_attribute=checkpoint_score_attr, checkpoint_score_order=MIN if checkpoint_score_desc else MAX, diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 12be4aea9f99..493be617e2d4 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -14,8 +14,7 @@ import ray from ray.air import Checkpoint from ray.tune.result import NODE_IP -from ray.util import PublicAPI -from ray.util.annotations import DeveloperAPI +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI from ray.util.ml_utils.util import is_nan MAX = "max" @@ -186,9 +185,10 @@ def __repr__(self): return f"_HeapCheckpoint({repr(self.tracked_checkpoint)})" -@PublicAPI(stability="beta") +# Move to ray.air.config when ml_utils is deprecated. @dataclass -class CheckpointStrategy: +@PublicAPI(stability="alpha") +class CheckpointConfig: """Configurable parameters for defining the checkpointing strategy. Default behavior is to persist all checkpoints to disk. If @@ -196,7 +196,7 @@ class CheckpointStrategy: checkpoints with maximum timestamp, i.e. the most recent checkpoints. Args: - num_to_keep (Optional[int]): The number of checkpoints to keep + num_to_keep: The number of checkpoints to keep on disk for this run. If a checkpoint is persisted to disk after there are already this many checkpoints, then an existing checkpoint will be deleted. If this is ``None`` then checkpoints @@ -208,7 +208,7 @@ class CheckpointStrategy: This attribute must be a key from the checkpoint dictionary which has a numerical value. Per default, the last checkpoints will be kept. - checkpoint_score_order (str). Either "max" or "min". + checkpoint_score_order: Either "max" or "min". If "max", then checkpoints with highest values of ``checkpoint_score_attribute`` will be kept. If "min", then checkpoints with lowest values of @@ -242,6 +242,23 @@ def checkpoint_score_attr(self) -> Optional[str]: return f"{prefix}{self.checkpoint_score_attribute}" +# Alias for backwards compatibility + +deprecation_message = ( + "`CheckpointStrategy` is deprecated and will be removed in " + "the future. Please use `ray.air.config.CheckpointStrategy` " + "instead." +) + + +@Deprecated(message=deprecation_message) +@dataclass +class CheckpointStrategy(CheckpointConfig): + def __post_init__(self): + logger.warning(deprecation_message) + super().__post_init__() + + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune. 
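
# Usage sketch for the renamed dataclass, assuming an AIR trainer defined
# elsewhere. `RunConfig.checkpoint_config` is the field wired up in this series;
# `CheckpointStrategy` remains available as a deprecated alias that warns.
from ray.air import CheckpointConfig, RunConfig

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,                      # keep at most two checkpoints on disk
        checkpoint_score_attribute="loss",  # rank checkpoints by the reported "loss"
        checkpoint_score_order="min",       # lower loss is better
    )
)
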
@@ -269,11 +286,11 @@ class _CheckpointManager: def __init__( self, - checkpoint_strategy: CheckpointStrategy, + checkpoint_strategy: CheckpointConfig, latest_checkpoint_id: int = 0, delete_fn: Optional[Callable[["_TrackedCheckpoint"], None]] = None, ): - self._checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + self._checkpoint_strategy = checkpoint_strategy or CheckpointConfig() # Incremental unique checkpoint ID of this run. self._latest_checkpoint_id = latest_checkpoint_id diff --git a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py index 16fd83a8ecb8..0c0a145ad26b 100644 --- a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py +++ b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py @@ -2,13 +2,13 @@ from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager, CheckpointStorage, - CheckpointStrategy, + CheckpointConfig, _TrackedCheckpoint, ) def test_unlimited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) for i in range(10): cpm.register_checkpoint( @@ -19,7 +19,7 @@ def test_unlimited_persistent_checkpoints(): def test_limited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=2)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=2)) for i in range(10): cpm.register_checkpoint( @@ -30,7 +30,7 @@ def test_limited_persistent_checkpoints(): def test_no_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=0)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=0)) for i in range(10): cpm.register_checkpoint( @@ -41,7 +41,7 @@ def test_no_persistent_checkpoints(): def test_dont_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = False for i in range(10): @@ -53,7 +53,7 @@ def test_dont_persist_memory_checkpoints(): def test_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = True for i in range(10): @@ -66,7 +66,7 @@ def test_persist_memory_checkpoints(): def test_keep_best_checkpoints(): cpm = _CheckpointManager( - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="metric", checkpoint_score_order="min", From 0482bce4c4bc0b2204283b5455eb9b5474de90b0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:10:20 +0000 Subject: [PATCH 43/70] Missed this --- doc/source/train/user_guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index ff2b2556afb0..8a75792732cf 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -680,7 +680,7 @@ directory ` of each run. # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints # By default, the "best" checkpoint path will refer to the most recent one. - # This can be configured by defining a CheckpointStrategy. + # This can be configured by defining a CheckpointConfig. 
print(trainer.best_checkpoint_path) # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints/checkpoint_000005 From 0cb579ddd3b868849453d4a72239fd3581873e07 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 11:47:44 -0700 Subject: [PATCH 44/70] Update test_result_grid.py --- python/ray/tune/tests/test_result_grid.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index fde9713908d0..ea7bf0a5bcb3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -58,7 +58,7 @@ def f(config): assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( @@ -68,7 +68,7 @@ def f(config): ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_metric_mode_unset(ray_start_2_cpus): @@ -87,10 +87,10 @@ def f(config): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_no_checkpoint(ray_start_2_cpus): @@ -131,7 +131,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert result.dataframe is None + assert result.metrics_dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config From 7ade7e4878c87212fdb6d3707fc10e3447a76164 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 21:09:20 +0000 Subject: [PATCH 45/70] Fix --- python/ray/train/tests/test_examples.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 2ebef818d7aa..316ff4dc5fc3 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,7 +51,7 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -107,7 +107,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -146,7 +146,7 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] From 0937dc857fb32d4cf0ff99bb8daeae5d7c2ade85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: 
Fri, 24 Jun 2022 21:33:14 +0000 Subject: [PATCH 46/70] Apply feeedback from code review --- doc/source/ray-air/package-ref.rst | 3 +++ doc/source/train/api.rst | 6 ------ doc/source/train/user_guide.rst | 2 +- python/ray/air/config.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 1 + 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 3b57ecfe6dc5..24586c5c620b 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,3 +124,6 @@ Configs .. automodule:: ray.air.config :members: +.. _train-api-checkpoint-config: + +.. autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index ea8b879cdcd9..babf01861019 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -112,12 +112,6 @@ TorchTensorboardProfilerCallback .. autoclass:: ray.train.callbacks.TorchTensorboardProfilerCallback -Checkpointing -------------- - -.. _train-api-checkpoint-strategy: - - .. _train-api-func-utils: Training Function Utilities diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 8a75792732cf..701492136952 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,7 +691,7 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-strategy` can be passed into +checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into ``Trainer.run``. As an example, to completely disable writing checkpoints to disk: diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b7317d283ca..cec982da4188 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -6,7 +6,7 @@ from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -# Move here later when ml_utils is deprecated +# Move here later when ml_utils is deprecated. Doing it now causes a circular import. from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 493be617e2d4..a3153dbd5e06 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -186,6 +186,7 @@ def __repr__(self): # Move to ray.air.config when ml_utils is deprecated. +# Doing it now causes a circular import. @dataclass @PublicAPI(stability="alpha") class CheckpointConfig: From b99362770375c0bda7b0087e88fe58a0891933f5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 22:34:27 +0000 Subject: [PATCH 47/70] Fix lint --- doc/source/ray-air/package-ref.rst | 2 -- doc/source/train/user_guide.rst | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 24586c5c620b..744206354232 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,6 +124,4 @@ Configs .. automodule:: ray.air.config :members: -.. _train-api-checkpoint-config: - .. 
autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 701492136952..be799a49e666 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,8 +691,8 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into -``Trainer.run``. +checkpoints to disk), a :class:`CheckpointConfig` can be passed into +``Trainer``. As an example, to completely disable writing checkpoints to disk: From ed870bd4f72bd8d64d8a24d651a21b068114c3d2 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:46:51 -0700 Subject: [PATCH 48/70] Update python/ray/train/__init__.py --- python/ray/train/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 74d360f117b7..5039ae461499 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -14,7 +14,7 @@ from ray.train.trainer import Trainer, TrainingIterator from ray.air.config import CheckpointConfig -# deprecated +# Deprecated. Alias of CheckpointConfig for backwards compat from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") From a4fd532ea77f16b0e4e70e738eeceacdc7912d85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 27 Jun 2022 18:26:44 +0000 Subject: [PATCH 49/70] Fix CI --- .../ray/train/examples/tune_cifar_pytorch_pbt_example.py | 2 +- python/ray/train/tests/test_minimal.py | 9 ++++----- python/ray/train/tests/test_tune.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index a7031b3116a1..38abba231ae8 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -185,7 +185,7 @@ def train_func(config): ), run_config=RunConfig( stop={"training_iteration": 2 if args.smoke_test else 100}, - failure=FailureConfig(max_failures=3), # used for fault tolerance + failure_config=FailureConfig(max_failures=3), # used for fault tolerance ), ) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index a23d7b4f23f9..7541edb16852 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,7 +1,7 @@ import pytest import ray -import ray.train as train +from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -38,10 +38,9 @@ def test_run(ray_start_4_cpus): config = TestConfig() def train_func(): - checkpoint = train.load_checkpoint() - train.report(**checkpoint) - train.save_checkpoint(**checkpoint) - return checkpoint[key] + checkpoint = session.get_checkpoint() + session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint) + return checkpoint.to_dict()[key] checkpoint = Checkpoint.from_dict( { diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 0196a84e46b6..e407679268ad 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -200,7 +200,9 @@ def train_func(): trainer = DataParallelTrainer( train_func, 
backend_config=TestConfig(), scaling_config=dict(num_workers=1) ) - tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) + tuner = Tuner( + trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3)) + ) analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data From d0ae2ba1998b544b01897d03768b0fa0d9a5c3e7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:21:57 -0700 Subject: [PATCH 50/70] Use warnings.warn --- python/ray/util/ml_utils/checkpoint_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index a3153dbd5e06..1f64633a9666 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -7,6 +7,7 @@ import os import shutil import tempfile +import warnings from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -256,7 +257,7 @@ def checkpoint_score_attr(self) -> Optional[str]: @dataclass class CheckpointStrategy(CheckpointConfig): def __post_init__(self): - logger.warning(deprecation_message) + warnings.warn(deprecation_message) super().__post_init__() From d44f75026ad47a6fcea4035e1ee8de68bd1980a3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:47:05 -0700 Subject: [PATCH 51/70] Make method privat --- python/ray/air/tests/test_api.py | 8 ++++---- python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index e4474c24626a..ffd2d722378c 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -46,20 +46,20 @@ def test_checkpointing_config(): ) checkpointing = CheckpointConfig() - assert checkpointing.checkpoint_score_attr is None + assert checkpointing._tune_legacy_checkpoint_score_attr is None checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) - assert checkpointing.checkpoint_score_attr == "min-metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "min-metric" def test_scaling_config(): diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 2d348c94d47a..d1e01e0a8e8d 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -168,7 +168,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else None ), checkpoint_score_attr=( - self._run_config.checkpoint_config.checkpoint_score_attr + self._run_config.checkpoint_config._tune_legacy_checkpoint_score_attr if self._run_config.checkpoint_config else None ), diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 1f64633a9666..c687ca48e8ad 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ 
b/python/ray/util/ml_utils/checkpoint_manager.py @@ -234,8 +234,10 @@ def __post_init__(self): ) @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``. + + Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c9d33806a000f3bb84bb68ca467e6bd7d6675923 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:39:36 -0700 Subject: [PATCH 52/70] Update python/ray/util/ml_utils/checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index c687ca48e8ad..4b6576a20d79 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,6 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute From 5c0a75317897b9ee37f6dfd7899342c8ed490cb8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:41:17 -0700 Subject: [PATCH 53/70] Update checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 4b6576a20d79..e6e58ff77402 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,9 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" + + Only used for Legacy API compatibility. 
+ """ if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c7b783b05f6fd76d0ba4f2febc6d58d3720e0ccf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 29 Jun 2022 11:39:02 -0700 Subject: [PATCH 54/70] Fix test --- python/ray/train/tests/test_tune.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e407679268ad..e25193eddf54 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,7 @@ import ray import ray.train as train from ray import tune -from ray.air import Checkpoint +from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -154,13 +154,16 @@ def train_func(): def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() if ckpt is not None: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, config["max_iter"]): - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) @@ -185,17 +188,20 @@ def train_func(config): def test_retry(ray_start_4_cpus): def train_func(): - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? itr = 0 if ckpt: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, 4): if i == 2 and not restored: raise Exception("try to fail me") - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) From 2e9ec6644b4e6b06c47f96918df28ffd0d8e97e3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:39:04 +0000 Subject: [PATCH 55/70] Rename files --- doc/source/train/examples.rst | 8 ++++---- doc/source/train/examples/train_fashion_mnist_example.rst | 4 ++-- .../train/examples/train_linear_dataset_example.rst | 4 ++-- doc/source/train/examples/train_linear_example.rst | 4 ++-- .../train/examples/tune_cifar_pytorch_pbt_example.rst | 6 +++--- python/ray/train/BUILD | 6 +++--- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- ...on_mnist_example.py => torch_fashion_mnist_example.py} | 0 ...dataset_example.py => torch_linear_dataset_example.py} | 0 .../{train_linear_example.py => torch_linear_example.py} | 0 ...rch_pbt_example.py => tune_cifar_torch_pbt_example.py} | 1 - python/ray/train/examples/tune_linear_example.py | 2 +- python/ray/train/tests/test_examples.py | 8 ++++++-- python/ray/train/tests/test_gpu.py | 6 +++--- python/ray/train/tests/test_tune.py | 2 +- .../workloads/pytorch_pbt_failure.py | 2 +- release/ml_user_tests/train/train_torch_linear_test.py | 2 +- 17 files changed, 30 insertions(+), 27 deletions(-) rename python/ray/train/examples/{train_fashion_mnist_example.py => torch_fashion_mnist_example.py} (100%) rename python/ray/train/examples/{train_linear_dataset_example.py => torch_linear_dataset_example.py} (100%) rename 
python/ray/train/examples/{train_linear_example.py => torch_linear_example.py} (100%) rename python/ray/train/examples/{tune_cifar_pytorch_pbt_example.py => tune_cifar_torch_pbt_example.py} (99%) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 2a4e0b75bbd1..1529e63342cc 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -15,10 +15,10 @@ General Examples PyTorch ~~~~~~~ -* :doc:`/train/examples/train_linear_example`: +* :doc:`/train/examples/torch_linear_example`: Simple example for PyTorch. -* :doc:`/train/examples/train_fashion_mnist_example`: +* :doc:`/train/examples/torch_fashion_mnist_example`: End-to-end example for PyTorch. * :doc:`/train/examples/transformers/transformers_example`: @@ -59,7 +59,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/tensorflow_linear_dataset_example`: Simple example for training a linear TensorFlow model. -* :doc:`/train/examples/train_linear_dataset_example`: +* :doc:`/train/examples/torch_linear_dataset_example`: Simple example for training a linear PyTorch model. * :doc:`/train/examples/tune_torch_linear_dataset_example`: @@ -75,7 +75,7 @@ Ray Tune Integration Examples * :doc:`/train/examples/tune_tensorflow_mnist_example`: End-to-end example for tuning a TensorFlow model. -* :doc:`/train/examples/tune_cifar_pytorch_pbt_example`: +* :doc:`/train/examples/tune_cifar_torch_pbt_example`: End-to-end example for tuning a PyTorch model with PBT. .. diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/train_fashion_mnist_example.rst index e11849e7b5f5..7082cc1433db 100644 --- a/doc/source/train/examples/train_fashion_mnist_example.rst +++ b/doc/source/train/examples/train_fashion_mnist_example.rst @@ -1,6 +1,6 @@ :orphan: -train_fashion_mnist_example +torch_fashion_mnist_example =========================== -.. literalinclude:: /../../python/ray/train/examples/train_fashion_mnist_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/train_linear_dataset_example.rst index 5dfe21be0dc5..f84daeb67a7a 100644 --- a/doc/source/train/examples/train_linear_dataset_example.rst +++ b/doc/source/train/examples/train_linear_dataset_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_dataset_example +torch_linear_dataset_example ============================ -.. literalinclude:: /../../python/ray/train/examples/train_linear_dataset_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_linear_dataset_example.py diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/train_linear_example.rst index 3abb4af64c81..10f3090d5196 100644 --- a/doc/source/train/examples/train_linear_example.rst +++ b/doc/source/train/examples/train_linear_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_example +torch_linear_example ==================== -.. literalinclude:: /../../python/ray/train/examples/train_linear_example.py +.. 
literalinclude:: /../../python/ray/train/examples/torch_linear_example.py diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst index 5a1f156d8ee7..dae870f3247e 100644 --- a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst +++ b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst @@ -1,6 +1,6 @@ :orphan: -tune_cifar_pytorch_pbt_example -============================== +tune_cifar_torch_pbt_example +============================ -.. literalinclude:: /../../python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +.. literalinclude:: /../../python/ray/train/examples/tune_cifar_torch_pbt_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6f719b725e64..89f32eda50e2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -64,10 +64,10 @@ py_test( ) py_test( - name = "tune_cifar_pytorch_pbt_example", + name = "tune_cifar_torch_pbt_example", size = "medium", - main = "examples/tune_cifar_pytorch_pbt_example.py", - srcs = ["examples/tune_cifar_pytorch_pbt_example.py"], + main = "examples/tune_cifar_torch_pbt_example.py", + srcs = ["examples/tune_cifar_torch_pbt_example.py"], tags = ["team:ml", "exclusive", "pytorch", "tune"], deps = [":train_lib"], args = ["--smoke-test"] diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 1cda7fc3e1ac..2d223c43ec1d 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,7 +1,7 @@ import argparse from ray.air import RunConfig -from ray.train.examples.train_fashion_mnist_example import train_func +from ray.train.examples.torch_fashion_mnist_example import train_func from ray.train.torch import TorchTrainer from ray.tune.integration.mlflow import MLflowLoggerCallback diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py similarity index 100% rename from python/ray/train/examples/train_fashion_mnist_example.py rename to python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py similarity index 100% rename from python/ray/train/examples/train_linear_dataset_example.py rename to python/ray/train/examples/torch_linear_dataset_example.py diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/torch_linear_example.py similarity index 100% rename from python/ray/train/examples/train_linear_example.py rename to python/ray/train/examples/torch_linear_example.py diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py similarity index 99% rename from python/ray/train/examples/tune_cifar_pytorch_pbt_example.py rename to python/ray/train/examples/tune_cifar_torch_pbt_example.py index 38abba231ae8..f0b5c786ff8d 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -58,7 +58,6 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): - # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py 
index 5d4a8edc911b..096c35547842 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,6 +1,6 @@ import argparse -from train_linear_example import train_func +from torch_linear_example import train_func import ray from ray import tune diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 316ff4dc5fc3..1bb0753b1c6b 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -16,10 +16,10 @@ from ray.train.examples.torch_quick_start import ( train_func as torch_quick_start_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.examples.torch_linear_example import train_func as linear_train_func from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback @@ -172,6 +172,10 @@ def test_horovod_torch_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == num_workers + loss = list(results.metrics_dataframe["loss"]) + assert len(loss) == num_epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index ac9a0afe7cfb..a4ac411eb9f5 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -18,10 +18,10 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import LinearDataset +from ray.train.examples.torch_linear_example import LinearDataset from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer @@ -350,7 +350,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): - from ray.train.examples.train_linear_dataset_example import train_linear + from ray.train.examples.torch_linear_dataset_example import train_linear assert train_linear(num_workers=2, use_gpu=True) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e25193eddf54..6c34bb7259b4 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -13,7 +13,7 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index d354b2834ac6..0704bed7ec75 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ 
b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -6,7 +6,7 @@ import ray from ray import tune from ray.air.config import RunConfig -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.examples.tune_cifar_torch_pbt_example import train_func from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining from ray.tune.tune_config import TuneConfig diff --git a/release/ml_user_tests/train/train_torch_linear_test.py b/release/ml_user_tests/train/train_torch_linear_test.py index 2a2a0a751061..1629ec6cdda9 100644 --- a/release/ml_user_tests/train/train_torch_linear_test.py +++ b/release/ml_user_tests/train/train_torch_linear_test.py @@ -4,7 +4,7 @@ import ray -from ray.train.examples.train_linear_example import train_linear +from ray.train.examples.torch_linear_example import train_linear if __name__ == "__main__": start = time.time() From 2bf89d221e4e171b1ebb3f9f62a8f8d3532cd3da Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:43:12 +0000 Subject: [PATCH 56/70] Use keras callback --- .../examples/tensorflow_linear_dataset_example.py | 11 +++-------- python/ray/train/examples/tensorflow_mnist_example.py | 8 +------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index ccc408455b44..0ee9d48d2077 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -2,20 +2,15 @@ from typing import Dict, Tuple import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback import ray -import ray.train as train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def get_datasets_and_configs( a=5, b=10, size=1000 ) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: @@ -60,7 +55,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard("train") + dataset_pipeline = session.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() for _ in range(epochs): diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index a0ef319f8756..97e8db033025 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -7,17 +7,11 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback -import ray.train as train from ray.train.tensorflow import TensorflowTrainer -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. 
From 375790ecde6e07658618bedf74d6b991e801c2c7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:44:47 +0000 Subject: [PATCH 57/70] Revert docstring changes --- python/ray/train/train_loop_utils.py | 73 ++++++++++++---------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index 5b03fd1fffe5..58244652e961 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -38,25 +38,23 @@ def get_dataset_shard( import ray from ray import train - from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard("train").to_torch() + data_shard = train.get_dataset_shard().to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() + trainer = Trainer(backend="torch") + trainer.start() + # Trainer will automatically handle sharding. - trainer = TorchTrainer( - train_func, - datasets={"train": dataset}, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + train_model = trainer.run(train_func, dataset=dataset) + trainer.shutdown() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -97,15 +95,16 @@ def report(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be reported by Train. @@ -127,7 +126,6 @@ def world_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -135,8 +133,10 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -153,18 +153,16 @@ def local_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = TorchTrainer( - train_func, - scaling_config={"use_gpu": True, "num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch", use_gpu=True) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -180,29 +178,18 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train - from ray.air import Checkpoint - from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - checkpoint = Checkpoint.from_dict( - { - # this would be set during checkpoint saving - "_current_checkpoint_id": 1, - "epoch": 3, - } - ) - trainer = TorchTrainer( - train_func, - resume_from_checkpoint=checkpoint, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func, checkpoint={"epoch": 3}) # 3 # 4 + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -226,16 +213,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - result = trainer.fit() - assert result.checkpoint + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -255,14 +242,14 @@ def world_size() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) - result = trainer.fit() - + trainer = Trainer(backend="torch", num_workers=4) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() if session is None: From baaaf47718c6b5a46d228a9829c764e8b5f9390e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:40:37 +0000 Subject: [PATCH 58/70] Rename example files in docs --- ..._fashion_mnist_example.rst => torch_fashion_mnist_example.rst} | 0 ...inear_dataset_example.rst => torch_linear_dataset_example.rst} | 0 .../{train_linear_example.rst => torch_linear_example.rst} | 0 ...r_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename doc/source/train/examples/{train_fashion_mnist_example.rst => torch_fashion_mnist_example.rst} (100%) rename doc/source/train/examples/{train_linear_dataset_example.rst => torch_linear_dataset_example.rst} (100%) rename doc/source/train/examples/{train_linear_example.rst => torch_linear_example.rst} (100%) rename doc/source/train/examples/{tune_cifar_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} (100%) diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/torch_fashion_mnist_example.rst similarity index 100% rename from doc/source/train/examples/train_fashion_mnist_example.rst rename to doc/source/train/examples/torch_fashion_mnist_example.rst diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/torch_linear_dataset_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_dataset_example.rst rename to doc/source/train/examples/torch_linear_dataset_example.rst diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/torch_linear_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_example.rst rename to doc/source/train/examples/torch_linear_example.rst diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_torch_pbt_example.rst similarity index 100% rename from doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst rename to doc/source/train/examples/tune_cifar_torch_pbt_example.rst From 691ce99d80343295a18be0947422cab225b53a19 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:49:34 +0000 Subject: [PATCH 59/70] Add legacy tests --- python/ray/train/tests/test_minimal.py | 49 ++++++++++++++++ python/ray/train/tests/test_tune.py | 78 ++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 7541edb16852..e3a1670ed3fb 100644 --- a/python/ray/train/tests/test_minimal.py +++ 
b/python/ray/train/tests/test_minimal.py @@ -1,6 +1,11 @@ +from typing import List, Dict + import pytest import ray +import ray.train as train +from ray.train import Trainer +from ray.train.callbacks import TrainingCallback from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup @@ -30,6 +35,14 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass +class TestCallback(TrainingCallback): + def __init__(self): + self.result_list = [] + + def handle_result(self, results: List[Dict], **info): + self.result_list.append(results) + + def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 @@ -61,6 +74,42 @@ def train_func(): assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key] +def test_run_legacy(ray_start_4_cpus): + """Tests that Train can be run without any specific backends.""" + num_workers = 2 + key = "value" + value = 1 + config = TestConfig() + + def train_func(): + checkpoint = train.load_checkpoint() + train.report(**checkpoint) + train.save_checkpoint(**checkpoint) + return checkpoint[key] + + checkpoint = {key: value} + test_callback = TestCallback() + + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + + # Test results. + assert len(results) == num_workers + assert all(result == 1 for result in results) + + # Test reporting and callbacks. + assert len(test_callback.result_list) == value + assert len(test_callback.result_list[0]) == num_workers + print(test_callback.result_list[0]) + assert all(result[key] == value for result in test_callback.result_list[0]) + + # Test checkpointing. 
+ assert trainer.latest_checkpoint[key] == value + + trainer.shutdown() + + def test_failure(): """Tests that backend frameworks and non-critical libraries are not imported.""" with pytest.raises(ModuleNotFoundError): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 6c34bb7259b4..640fa98a19a0 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -18,6 +19,7 @@ ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer +from ray.train.trainer import Trainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -219,6 +221,82 @@ def train_func(): assert len(trial_dfs[0]["training_iteration"]) == 4 +def test_tune_error_legacy(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + with pytest.raises(TuneError): + tune.run(TestTrainable) + + +def test_tune_checkpoint_legacy(ray_start_4_cpus): + def train_func(): + for i in range(10): + train.report(test=i) + train.save_checkpoint(hello="world") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable).trials + checkpoint_path = trial.checkpoint.dir_or_data + assert os.path.exists(checkpoint_path) + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["hello"] == "world" + + +def test_reuse_checkpoint_legacy(ray_start_4_cpus): + def train_func(config): + itr = 0 + ckpt = train.load_checkpoint() + if ckpt is not None: + itr = ckpt["iter"] + 1 + + for i in range(itr, config["max_iter"]): + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + checkpoint_path = trial.checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 4 + analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 5 + + +def test_retry_legacy(ray_start_4_cpus): + def train_func(): + ckpt = train.load_checkpoint() + restored = bool(ckpt) # Does a previous checkpoint exist? 
+ itr = 0 + if ckpt: + itr = ckpt["iter"] + 1 + + for i in range(itr, 4): + if i == 2 and not restored: + raise Exception("try to fail me") + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + analysis = tune.run(TestTrainable, max_failures=3) + checkpoint_path = analysis.trials[0].checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 3 + + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 4 + + if __name__ == "__main__": import sys From d9122c38620d646fa923a130dbeed2b47951d0b1 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 18:00:31 +0000 Subject: [PATCH 60/70] Switch to session in train code --- .../horovod/horovod_pytorch_example.py | 18 ++++---- .../examples/horovod/horovod_tune_example.py | 4 +- .../pytorch/torch_fashion_mnist_example.py | 9 ++-- .../pytorch/torch_linear_dataset_example.py | 12 ++--- .../examples/pytorch/torch_linear_example.py | 3 +- .../distributed_sage_example.py | 21 +++++---- .../tf/tensorflow_autoencoder_example.ipynb | 17 +++---- .../tf/tensorflow_autoencoder_example.py | 12 ++--- .../tf/tensorflow_linear_dataset_example.py | 12 ++--- .../examples/tf/tensorflow_mnist_example.py | 9 +--- python/ray/air/tests/test_dataset_config.py | 10 ++--- python/ray/air/util/check_ingest.py | 18 ++++---- .../ray/train/_internal/backend_executor.py | 19 ++++---- python/ray/train/_internal/checkpoint.py | 2 +- python/ray/train/_internal/dataset_spec.py | 8 ++-- python/ray/train/_internal/session.py | 2 +- python/ray/train/backend.py | 3 +- python/ray/train/constants.py | 6 +-- python/ray/train/data_parallel_trainer.py | 27 ++++++------ .../train/examples/horovod/horovod_example.py | 4 +- .../train/examples/mlflow_simple_example.py | 5 +-- .../examples/torch_fashion_mnist_example.py | 10 +++-- .../examples/torch_linear_dataset_example.py | 7 +-- .../train/examples/torch_linear_example.py | 3 +- .../examples/tune_cifar_torch_pbt_example.py | 14 +++--- python/ray/train/horovod/horovod_trainer.py | 38 +++++++++------- .../train/huggingface/_huggingface_utils.py | 17 +++---- .../train/huggingface/huggingface_trainer.py | 11 +++-- .../train/tensorflow/tensorflow_trainer.py | 44 ++++++++++--------- .../train/tests/test_tensorflow_trainer.py | 3 +- python/ray/train/tests/test_torch_trainer.py | 7 +-- python/ray/train/tests/test_tune.py | 8 ++-- python/ray/train/torch/torch_trainer.py | 37 +++++++++------- python/ray/train/torch/train_loop_utils.py | 27 ++++++++++-- .../horovod/workloads/horovod_tune_test.py | 21 +++++---- 35 files changed, 243 insertions(+), 225 deletions(-) diff --git a/python/ray/air/examples/horovod/horovod_pytorch_example.py b/python/ray/air/examples/horovod/horovod_pytorch_example.py index fe7355eb8a00..946cddc4fd59 100644 --- a/python/ray/air/examples/horovod/horovod_pytorch_example.py +++ b/python/ray/air/examples/horovod/horovod_pytorch_example.py @@ -2,6 +2,7 @@ from filelock import FileLock import horovod.torch as hvd import os +from ray.air.checkpoint import Checkpoint import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -9,7 +10,7 @@ from torchvision import datasets, transforms import ray -from ray import train +from ray.air import session from ray.train.horovod import HorovodTrainer @@ -141,19 +142,16 @@ def train_func(config): model, optimizer, 
train_loader, train_sampler = setup(config) - results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - results.append(loss) - if save_model_as_dict: - train.save_checkpoint(model=model.state_dict()) - else: - train.save_checkpoint(model=model) - print("losses of each epoch:") - print(results) - return results + if save_model_as_dict: + checkpoint_dict = dict(model=model.state_dict()) + else: + checkpoint_dict = dict(model=model) + checkpoint_dict = Checkpoint.from_dict(checkpoint_dict) + session.report(dict(loss=loss), checkpoint=checkpoint_dict) def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/air/examples/horovod/horovod_tune_example.py b/python/ray/air/examples/horovod/horovod_tune_example.py index 05ab4032924d..24539d759a40 100644 --- a/python/ray/air/examples/horovod/horovod_tune_example.py +++ b/python/ray/air/examples/horovod/horovod_tune_example.py @@ -3,8 +3,8 @@ import torch import ray -from ray import train from ray import tune +from ray.air import session from ray.train.horovod import HorovodTrainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -83,7 +83,7 @@ def train_loop_per_worker(config): optimizer.step() time.sleep(0.1) - train.report(loss=loss.item()) + session.report(dict(loss=loss.item())) total = time.time() - start print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.") diff --git a/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py b/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py index b4e2088b9d68..a618292ecf70 100644 --- a/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py +++ b/python/ray/air/examples/pytorch/torch_fashion_mnist_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict +from ray.air import session import torch from torch import nn @@ -48,7 +49,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -66,7 +67,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -90,7 +91,7 @@ def train_func(config: Dict): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. 
train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -109,7 +110,7 @@ def train_func(config: Dict): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) + session.report(dict(loss=loss)) def train_fashion_mnist(num_workers=2, use_gpu=False): diff --git a/python/ray/air/examples/pytorch/torch_linear_dataset_example.py b/python/ray/air/examples/pytorch/torch_linear_dataset_example.py index 42d1569e6623..6da379ba8de7 100644 --- a/python/ray/air/examples/pytorch/torch_linear_dataset_example.py +++ b/python/ray/air/examples/pytorch/torch_linear_dataset_example.py @@ -1,13 +1,14 @@ import argparse import random from typing import Tuple +from ray.air.checkpoint import Checkpoint import torch import torch.nn as nn import ray import ray.train as train -from ray.air import train_test_split +from ray.air import session, train_test_split from ray.air.result import Result from ray.data import Dataset from ray.train.batch_predictor import BatchPredictor @@ -64,8 +65,8 @@ def train_func(config): lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) - train_dataset_shard = train.get_dataset_shard("train") - validation_dataset = train.get_dataset_shard("validation") + train_dataset_shard = session.get_dataset_shard("train") + validation_dataset = session.get_dataset_shard("validation") model = nn.Linear(1, hidden_size) model = train.torch.prepare_model(model) @@ -95,13 +96,12 @@ def train_func(config): device = train.torch.get_device() train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) - if train.world_rank() == 0: + if session.get_world_rank() == 0: result = validate_epoch(validation_torch_dataset, model, loss_fn, device) else: result = {} - train.report(**result) results.append(result) - train.save_checkpoint(model=model) + session.report(result, checkpoint=Checkpoint.from_dict(dict(model=model))) return results diff --git a/python/ray/air/examples/pytorch/torch_linear_example.py b/python/ray/air/examples/pytorch/torch_linear_example.py index 31f856416296..ba0d12fd1197 100644 --- a/python/ray/air/examples/pytorch/torch_linear_example.py +++ b/python/ray/air/examples/pytorch/torch_linear_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn import ray.train as train @@ -78,8 +79,8 @@ def train_func(config): for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) - train.report(**result) results.append(result) + session.report(result) return results diff --git a/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py b/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py index a1740e6cefc6..9c1ab2e00063 100644 --- a/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py +++ b/python/ray/air/examples/pytorch_geometric/distributed_sage_example.py @@ -4,6 +4,7 @@ import os import argparse from filelock import FileLock +from ray.air import session import torch import torch.nn.functional as F @@ -63,8 +64,8 @@ def train_loop_per_worker(train_loop_config): data = dataset[0] train_idx = data.train_mask.nonzero(as_tuple=False).view(-1) - train_idx = train_idx.split(train_idx.size(0) // train.world_size())[ - train.world_rank() + train_idx = train_idx.split(train_idx.size(0) // session.get_world_size())[ + session.get_world_rank() ] train_loader = NeighborSampler( 
@@ -79,7 +80,7 @@ def train_loop_per_worker(train_loop_config): train_loader = train.torch.prepare_data_loader(train_loader, add_dist_sampler=False) # Do validation on rank 0 worker only. - if train.world_rank() == 0: + if session.get_world_rank() == 0: subgraph_loader = NeighborSampler( data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False ) @@ -112,13 +113,13 @@ def train_loop_per_worker(train_loop_config): loss.backward() optimizer.step() - if train.world_rank() == 0: + if session.get_world_rank() == 0: print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}") train_accuracy = validation_accuracy = test_accuracy = None # Do validation on rank 0 worker only. - if train.world_rank() == 0: + if session.get_world_rank() == 0: model.eval() with torch.no_grad(): out = model.module.test(x, subgraph_loader) @@ -131,10 +132,12 @@ def train_loop_per_worker(train_loop_config): ) test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum()) - train.report( - train_accuracy=train_accuracy, - validation_accuracy=validation_accuracy, - test_accuracy=test_accuracy, + session.report( + dict( + train_accuracy=train_accuracy, + validation_accuracy=validation_accuracy, + test_accuracy=test_accuracy, + ) ) diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb index bbab099eaced..0d8e36efcebc 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb @@ -238,8 +238,8 @@ "source": [ "`train_func` contains regular TensorFlow code with a few notable exceptions:\n", "* We build and compile our model in the [`MultiWorkerMirrioredStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) context.\n", - "* We call {py:func}`train.get_dataset_shard ` to get a subset of our training data, and call {py:meth}`Dataset.to_tf ` with {py:func}`prepare_dataset_shard ` to convert the subset to a TensorFlow dataset.\n", - "* We save model state using {py:func}`train.save_checkpoint `.\n" + "* We call {py:func}`session.get_dataset_shard ` to get a subset of our training data, and call {py:meth}`Dataset.to_tf ` with {py:func}`prepare_dataset_shard ` to convert the subset to a TensorFlow dataset.\n", + "* We use the {py:class}`ray.air.callbacks.keras.Callback ` for metric reporting and checkpointing.\n" ] }, { @@ -250,14 +250,9 @@ "source": [ "import os\n", "import json\n", - "from ray import train\n", + "from ray.air import session\n", + "from ray.air.callbacks.keras import Callback\n", "from ray.train.tensorflow import prepare_dataset_shard\n", - "from tensorflow.keras.callbacks import Callback\n", - "\n", - "class TrainCheckpointReportCallback(Callback):\n", - " def on_epoch_end(self, epoch, logs=None):\n", - " train.save_checkpoint(**{\"model\": self.model.get_weights()})\n", - " train.report(**logs)\n", "\n", "def train_func(config: dict):\n", "\n", @@ -268,7 +263,7 @@ " tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n", " num_workers = len(tf_config[\"cluster\"][\"worker\"])\n", "\n", - " dataset_shard = train.get_dataset_shard(\"train\")\n", + " dataset_shard = session.get_dataset_shard(\"train\")\n", "\n", " strategy = tf.distribute.MultiWorkerMirroredStrategy()\n", "\n", @@ -296,7 +291,7 @@ " )\n", " )\n", " history = multi_worker_model.fit(\n", - " tf_dataset, callbacks=[TrainCheckpointReportCallback()]\n", + " tf_dataset, callbacks=[Callback()]\n", " )\n", " 
results.append(history.history)\n", " return results" diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.py b/python/ray/air/examples/tf/tensorflow_autoencoder_example.py index e90a334de885..c3b61d2738f2 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.py +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.py @@ -5,7 +5,7 @@ import argparse import numpy as np import pandas as pd -import ray.train as train +from ray.air import session import tensorflow as tf import tensorflow_datasets as tfds from ray.data.datasource import SimpleTensorFlowDatasource @@ -14,19 +14,13 @@ from ray.air.result import Result from ray.air.train.integrations.tensorflow import TensorflowTrainer from ray.train.tensorflow import prepare_dataset_shard -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback import ray from ray.data.extensions import TensorArray -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def get_dataset(split_type="train"): def dataset_factory(): return tfds.load("mnist", split=[split_type], as_supervised=True)[0].take(128) @@ -80,7 +74,7 @@ def train_func(config: dict): per_worker_batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") strategy = tf.distribute.MultiWorkerMirroredStrategy() diff --git a/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py b/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py index c38afc61c92c..18a74e3cc343 100644 --- a/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py +++ b/python/ray/air/examples/tf/tensorflow_linear_dataset_example.py @@ -2,10 +2,10 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import Callback import ray -import ray.train as train +from ray.air import session +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback from ray.air.result import Result from ray.data import Dataset from ray.train.batch_predictor import BatchPredictor @@ -16,12 +16,6 @@ ) -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def get_dataset(a=5, b=10, size=1000) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -53,7 +47,7 @@ def train_func(config: dict): metrics=[tf.keras.metrics.mean_squared_error], ) - dataset = train.get_dataset_shard("train") + dataset = session.get_dataset_shard("train") results = [] for _ in range(epochs): diff --git a/python/ray/air/examples/tf/tensorflow_mnist_example.py b/python/ray/air/examples/tf/tensorflow_mnist_example.py index cfa3701f2fdd..e008bc0ae28f 100644 --- a/python/ray/air/examples/tf/tensorflow_mnist_example.py +++ b/python/ray/air/examples/tf/tensorflow_mnist_example.py @@ -8,16 +8,9 @@ import numpy as np from ray.air.result import Result import tensorflow as tf -from tensorflow.keras.callbacks import Callback -import ray.train as train from ray.train.tensorflow import TensorflowTrainer - - -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - 
train.report(**logs) +from ray.air.callbacks.keras import Callback as TrainCheckpointReportCallback def mnist_dataset(batch_size: int) -> tf.data.Dataset: diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 9b30f7252e0d..c6a352ac33c0 100644 --- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -4,7 +4,7 @@ import pytest import ray -from ray import train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import Dataset, DatasetPipeline from ray.data.preprocessors import BatchMapper @@ -29,13 +29,13 @@ def __init__( self, num_workers: int, expect_ds: bool, expect_sizes: Optional[dict], **kwargs ): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") if expect_ds: assert isinstance(data_shard, Dataset), data_shard else: assert isinstance(data_shard, DatasetPipeline), data_shard for k, v in expect_sizes.items(): - shard = train.get_dataset_shard(k) + shard = session.get_dataset_shard(k) if v == -1: assert shard is None, shard else: @@ -197,7 +197,7 @@ class TestStream(DataParallelTrainer): def __init__(self, check_results_fn, **kwargs): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") assert isinstance(data_shard, DatasetPipeline), data_shard results = [] for epoch in data_shard.iter_epochs(2): @@ -218,7 +218,7 @@ class TestBatch(DataParallelTrainer): def __init__(self, check_results_fn, **kwargs): def train_loop_per_worker(): - data_shard = train.get_dataset_shard("train") + data_shard = session.get_dataset_shard("train") assert isinstance(data_shard, Dataset), data_shard results = data_shard.take() check_results_fn(data_shard, results) diff --git a/python/ray/air/util/check_ingest.py b/python/ray/air/util/check_ingest.py index 74c72107f149..0073a9bbaf20 100755 --- a/python/ray/air/util/check_ingest.py +++ b/python/ray/air/util/check_ingest.py @@ -7,7 +7,7 @@ import numpy as np import ray -from ray import train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import DatasetPipeline, Dataset from ray.data.preprocessors import BatchMapper, Chain @@ -67,8 +67,8 @@ def make_train_loop( def train_loop_per_worker(): import pandas as pd - rank = train.world_rank() - data_shard = train.get_dataset_shard("train") + rank = session.get_world_rank() + data_shard = session.get_dataset_shard("train") start = time.perf_counter() epochs_read, batches_read, bytes_read = 0, 0, 0 batch_delays = [] @@ -102,11 +102,13 @@ def generate_epochs(data: Union[Dataset, DatasetPipeline], epochs: int): # NOTE: This isn't recursive and will just return the size of # the object pointers if list of non-primitive types. 
bytes_read += sys.getsizeof(batch) - train.report( - bytes_read=bytes_read, - batches_read=batches_read, - epochs_read=epochs_read, - batch_delay=batch_delay, + session.report( + dict( + bytes_read=bytes_read, + batches_read=batches_read, + epochs_read=epochs_read, + batch_delay=batch_delay, + ) ) batch_start = time.perf_counter() delta = time.perf_counter() - start diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index b8ccec69ca3c..c658a3ae66b1 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -44,7 +44,7 @@ class BackendExecutor: This class holds a worker group and is responsible for executing the training function on the workers, and collecting intermediate results - from ``train.report()`` and ``train.checkpoint()``. + from ``session.report()``. Args: backend_config: The configurations for this @@ -288,7 +288,7 @@ def start_training( Dataset. checkpoint: The checkpoint data that should be loaded onto each worker and accessed by the - training function via ``train.load_checkpoint()``. If this + training function via ``session.get_checkpoint()``. If this is ``None`` then no checkpoint will be loaded. """ use_detailed_autofilled_metrics = env_integer( @@ -362,8 +362,7 @@ def get_next_results(self) -> Optional[List[TrainingResult]]: """Fetches the next ``TrainingResult`` from each worker. Each ``TrainingResult`` is expected to correspond to the same step from - each worker (e.g. the same call to ``train.report()`` or - ``train.checkpoint()``). + each worker (e.g. the same call to ``session.report()``). Returns: A list of ``TrainingResult``s with the same @@ -396,7 +395,8 @@ def get_next(): raise RuntimeError( "Some workers returned results while " "others didn't. Make sure that " - "`train.report()` and `train.save_checkpoint()` " + "`session.report()` (legacy API:" + "`train.report()` and `train.save_checkpoint()`) " "are called the same number of times on all " "workers." ) @@ -408,10 +408,11 @@ def get_next(): if any(r.type != result_type for r in results): raise RuntimeError( "Some workers returned results with " - "different types. Make sure `train.report()` " - "and `train.save_checkpoint()` are called the " - "same number of times and in the same order on " - "each worker." + "different types. Make sure that " + "`session.report()` (legacy API:" + "`train.report()` and `train.save_checkpoint()`) " + "are called the same number of times on all " + "workers." ) return results diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 8bffe957833d..bef2a63fe9ac 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -116,7 +116,7 @@ def _process_checkpoint( f"checkpoint_score_attribute: " f"{score_attr}. " f"Include this attribute in the call to " - f"train.save_checkpoint." + f"`session.report()`." ) tracked_checkpoint = _TrackedCheckpoint( diff --git a/python/ray/train/_internal/dataset_spec.py b/python/ray/train/_internal/dataset_spec.py index 6e96a522a84d..f4e3d5e3fb9e 100644 --- a/python/ray/train/_internal/dataset_spec.py +++ b/python/ray/train/_internal/dataset_spec.py @@ -17,10 +17,10 @@ class RayDatasetSpec: dataset_or_dict: An optional Ray Dataset (or DatasetPipeline) or a dictionary of datasets to be sharded across all the training workers, which can be accessed - from the training function via ``train.get_dataset_shard()``. 
Multiple Datasets - can be passed in as a dictionary that maps each name key to a Dataset value, - and each Dataset can be accessed from the training function by passing in a - `dataset_name` argument to ``train.get_dataset_shard()``. + from the training function via ``session.get_dataset_shard()``. Multiple + Datasets can be passed in as a dictionary that maps each name key to a + Dataset value, and each Dataset can be accessed from the training function + by passing in a `dataset_name` argument to ``session.get_dataset_shard()``. dataset_split_fn: An optional callable to specify how the provided ``dataset`` should be split across the training workers. It is expected to take in two arguments. The first one is the ``dataset``, just as is passed in to the diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 10008bda62a4..b184cd3787df 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -124,7 +124,7 @@ def start(self): self.training_thread.start() def pause_reporting(self): - """Ignore all future ``train.report()`` calls.""" + """Ignore all future ``session.report()`` calls.""" self.ignore_report = True def finish(self): diff --git a/python/ray/train/backend.py b/python/ray/train/backend.py index f450b27544be..7e22f0f431cb 100644 --- a/python/ray/train/backend.py +++ b/python/ray/train/backend.py @@ -46,8 +46,7 @@ def encode_data(data_dict: Dict) -> EncodedData: """Logic to encode a data dict before sending to the driver. This function will be called on the workers for any data that is - sent to the driver via ``train.report()`` or - ``train.save_checkpoint()``. + sent to the driver via ``session.report()``. """ return data_dict diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index d640293799a5..b814380972a9 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -14,7 +14,7 @@ WILDCARD_KEY, ) -# Autofilled train.report() metrics. Keys should be consistent with Tune. +# Autofilled session.report() metrics. Keys should be consistent with Tune. TIMESTAMP = "_timestamp" TIME_THIS_ITER_S = "_time_this_iter_s" TRAINING_ITERATION = "_training_iteration" @@ -72,10 +72,10 @@ # Reserved keyword used by the ``TorchWorkerProfiler`` and # ``TorchTensorboardProfilerCallback`` for passing PyTorch Profiler data -# through ``train.report()`` +# through ``session.report()`` PYTORCH_PROFILER_KEY = "_train_torch_profiler" # Reserved keys used across all Callbacks. -# By default these will be filtered out from ``train.report()``. +# By default these will be filtered out from ``session.report()``. # See ``TrainingCallback._preprocess_results`` for more details. ALL_RESERVED_KEYS = {PYTORCH_PROFILER_KEY} diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index c8729282ab40..be8eae27d351 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -76,36 +76,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. 
+ ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() **How do I use ``DataParallelTrainer`` or any of its subclasses?** @@ -114,10 +113,10 @@ def train_loop_per_worker(): .. code-block:: python import ray - from ray import train + from ray.air import session def train_loop_for_worker(): - dataset_shard_for_this_worker = train.get_dataset_shard("train") + dataset_shard_for_this_worker = session.get_dataset_shard("train") assert len(dataset_shard_for_this_worker) == 1 diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c01788008ec5..bb5e4ee7a567 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -2,6 +2,7 @@ import os import horovod.torch as hvd +from ray.air import session import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -10,7 +11,6 @@ from torchvision import datasets, transforms import ray -from ray import train from ray.train.horovod import HorovodTrainer @@ -148,7 +148,7 @@ def train_func(config): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - train.report(loss=loss) + session.report(dict(loss=loss)) def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d64d0525ae58..a5803a072a71 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -1,5 +1,4 @@ -from ray import train -from ray.air import RunConfig +from ray.air import RunConfig, session from ray.train.torch import TorchTrainer from ray.tune.integration.mlflow import MLflowLoggerCallback from ray.tune.logger import TBXLoggerCallback @@ -7,7 +6,7 @@ def train_func(): for i in range(3): - train.report(epoch=i) + session.report(dict(epoch=i)) trainer = TorchTrainer( diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 6e8db3220db4..725ba078e5d5 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict +from ray.air import session import torch from torch import nn @@ -48,7 +49,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = 
len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -66,7 +67,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -90,7 +91,7 @@ def train_func(config: Dict): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -111,9 +112,10 @@ def train_func(config: Dict): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) loss_results.append(loss) + session.report(dict(loss=loss)) + # return required for backwards compatibility with the old API return loss_results diff --git a/python/ray/train/examples/torch_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py index acfa0ce2e637..a1e8889c4ea6 100644 --- a/python/ray/train/examples/torch_linear_dataset_example.py +++ b/python/ray/train/examples/torch_linear_dataset_example.py @@ -1,5 +1,6 @@ import argparse from typing import Dict, Tuple +from ray.air import session import torch import torch.nn as nn @@ -77,8 +78,8 @@ def train_func(config): lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) - train_dataset_pipeline_shard = train.get_dataset_shard("train") - validation_dataset_pipeline_shard = train.get_dataset_shard("validation") + train_dataset_pipeline_shard = session.get_dataset_shard("train") + validation_dataset_pipeline_shard = session.get_dataset_shard("validation") model = nn.Linear(1, hidden_size) model = train.torch.prepare_model(model) @@ -113,7 +114,7 @@ def train_func(config): train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) - train.report(**result) + session.report(result) def train_linear(num_workers=2, use_gpu=False): diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index ceabd0c2853f..15d7ade41622 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn @@ -78,8 +79,8 @@ def train_func(config): for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) - train.report(**result) results.append(result) + session.report(result) # return required for backwards compatibility with the old API return results diff --git a/python/ray/train/examples/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py index f0b5c786ff8d..50389cd6e503 100644 --- a/python/ray/train/examples/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -1,6 +1,7 @@ import argparse import numpy as np +from ray.air import session import torch import torch.nn as nn import torchvision.transforms as transforms @@ -20,7 +21,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): - size = 
len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -38,7 +39,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -98,7 +99,7 @@ def train_func(config): train_dataset = Subset(train_dataset, list(range(64))) validation_dataset = Subset(validation_dataset, list(range(64))) - worker_batch_size = config["batch_size"] // train.world_size() + worker_batch_size = config["batch_size"] // session.get_world_size() train_loader = DataLoader(train_dataset, batch_size=worker_batch_size) validation_loader = DataLoader(validation_dataset, batch_size=worker_batch_size) @@ -109,15 +110,10 @@ def train_func(config): # Create loss. criterion = nn.CrossEntropyLoss() - results = [] - for _ in range(epochs): train_epoch(train_loader, model, criterion, optimizer) result = validate_epoch(validation_loader, model, criterion) - train.report(**result) - results.append(result) - - return results + session.report(result) if __name__ == "__main__": diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index 3b5e6e76c5ee..527170c62ee6 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -38,40 +38,40 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You could use ``TensorflowPredictor`` or ``TorchPredictor`` in conjunction with - HorovodTrainer. You must save the model under the "model" kwarg in - ``train.save_checkpoint()``, so that it can be used by corresponding predictors. + HorovodTrainer. You must save the model under the "model" kwarg in the + ``Checkpoint`` passed to ``session.report()``, so that it can be used by + corresponding predictors. 
Example: @@ -83,6 +83,7 @@ def train_loop_per_worker(): import horovod.torch as hvd import torch import torch.nn as nn + from ray.air import session, Checkpoint from ray.train.horovod import HorovodTrainer input_size = 1 @@ -101,7 +102,7 @@ def forward(self, input): def train_loop_per_worker(): hvd.init() - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") model = NeuralNetwork() device = train.torch.get_device() model.to(device) @@ -132,7 +133,12 @@ def train_loop_per_worker(): loss.backward() optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - train.save_checkpoint(model=model.state_dict()) + session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(model=model.state_dict()) + ), + ) train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) scaling_config = {"num_workers": 3} # If using GPUs, use the below scaling config instead. diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/huggingface/_huggingface_utils.py index 2f3dd53f0ad8..d7b50f810d99 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/huggingface/_huggingface_utils.py @@ -5,7 +5,8 @@ import transformers.trainer from transformers.trainer_callback import TrainerCallback -from ray import train +from ray.air import session +from ray.air.checkpoint import Checkpoint from ray.util import get_node_ip_address from ray.data.dataset import Dataset @@ -118,9 +119,9 @@ def __init__(self) -> None: # HF first logs metrics, and then checkpoints. With Ray AIR, we need the # opposite. Furthermore, some metrics are logged just at the end. # Therefore, if we detect that a checkpoint will be created, - # we delay the train.report call after the checkpoint is reported + # we delay the session.report call after the checkpoint is reported # to Ray Train. - self.delayed_report = {} + self.delayed_report = {"metrics": {}, "checkpoint": None} super().__init__() def on_step_end(self, args, state, control, **kwargs): @@ -132,7 +133,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, model=None, logs=None, **kwargs): # Log is called in multiple places (evaluation, train metrics). report = {**logs, "step": state.global_step, "epoch": state.epoch} - self.delayed_report.update(report) + self.delayed_report["metrics"].update(report) def on_save(self, args, state, control, **kwargs): # Save is called after evaluation. 
@@ -140,8 +141,8 @@ def on_save(self, args, state, control, **kwargs): transformers.trainer.get_last_checkpoint(args.output_dir) ).absolute() if checkpoint_path: - train.save_checkpoint( - **{ + self.delayed_report["checkpoint"] = Checkpoint.from_dict( + { NODE_IP_KEY: get_node_ip_address(), CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path), } @@ -149,8 +150,8 @@ def on_save(self, args, state, control, **kwargs): def _report(self): if self.delayed_report: - train.report(**self.delayed_report) - self.delayed_report = {} + session.report(**self.delayed_report) + self.delayed_report = {"metrics": {}, "checkpoint": None} def on_epoch_begin(self, args, state, control, **kwargs): # Report previous epoch - this way we ensure everything diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py index 786909b927b3..e8ef4a78d21f 100644 --- a/python/ray/train/huggingface/huggingface_trainer.py +++ b/python/ray/train/huggingface/huggingface_trainer.py @@ -13,7 +13,6 @@ import transformers.training_args from torch.utils.data import Dataset as TorchDataset -from ray import train from ray.air import session from ray.air._internal.checkpointing import ( save_preprocessor_to_dir, @@ -408,12 +407,12 @@ def _huggingface_train_loop_per_worker(config): trainer_init_per_worker = config.pop("_trainer_init_per_worker") # Env vars necessary for HF to setup DDP - os.environ["RANK"] = str(train.world_rank()) - os.environ["WORLD_SIZE"] = str(train.world_size()) - os.environ["LOCAL_RANK"] = str(train.local_rank()) + os.environ["RANK"] = str(session.get_world_rank()) + os.environ["WORLD_SIZE"] = str(session.get_world_size()) + os.environ["LOCAL_RANK"] = str(session.get_local_rank()) - train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY) - eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY) + train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY) + eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY) train_torch_dataset, eval_torch_dataset = process_datasets( train_dataset, diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index 3d5921767724..ffda25726780 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -38,36 +38,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. + ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. 
- train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You can also use any of the :ref:`TensorFlow specific function utils `. @@ -77,12 +76,12 @@ def train_loop_per_worker(): def train_loop_per_worker(): # Turns off autosharding for a dataset. # You should use this if you are doing - # `train.get_dataset_shard(...).to_tf(...)` + # `session.get_dataset_shard(...).to_tf(...)` # as the data will be already sharded. train.tensorflow.prepare_dataset_shard(...) To save a model to use for the ``TensorflowPredictor``, you must save it under the - "model" kwarg in ``train.save_checkpoint()``. + "model" kwarg in ``Checkpoint`` passed to ``session.report()``. Example: @@ -92,9 +91,8 @@ def train_loop_per_worker(): import ray from ray import train - from ray.train.tensorflow import prepare_dataset_shard - - from ray.train.tensorflow import TensorflowTrainer + from ray.air import session, Checkpoint + from ray.train.tensorflow import prepare_dataset_shard, TensorflowTrainer input_size = 1 @@ -106,7 +104,7 @@ def build_model(): ) def train_loop_for_worker(config): - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with strategy.scope(): model = build_model() @@ -125,8 +123,14 @@ def train_loop_for_worker(config): ) ) model.fit(tf_dataset) - train.save_checkpoint( - epoch=epoch, model=model.get_weights()) + # You can also use ray.air.callbacks.keras.Callback + # for reporting and checkpointing instead of reporting manually. 
+ session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(epoch=epoch, model=model.get_weights()) + ), + ) train_dataset = ray.data.from_items( [{"x": x, "y": x + 1} for x in range(32)]) diff --git a/python/ray/train/tests/test_tensorflow_trainer.py b/python/ray/train/tests/test_tensorflow_trainer.py index 95c3f84b92e1..1f17de0c3e25 100644 --- a/python/ray/train/tests/test_tensorflow_trainer.py +++ b/python/ray/train/tests/test_tensorflow_trainer.py @@ -4,7 +4,6 @@ import pytest import ray -from ray import train from ray.air import session from ray.air.checkpoint import Checkpoint from ray.air.examples.tf.tensorflow_linear_dataset_example import get_dataset @@ -65,7 +64,7 @@ def train_func(config): def test_tensorflow_e2e(ray_start_4_cpus): def train_func(): model = build_model().get_weights() - train.save_checkpoint(**{MODEL_KEY: model}) + session.report({}, checkpoint=Checkpoint.from_dict({MODEL_KEY: model})) scaling_config = {"num_workers": 2} trainer = TensorflowTrainer( diff --git a/python/ray/train/tests/test_torch_trainer.py b/python/ray/train/tests/test_torch_trainer.py index de0b6f8606c4..0d158e3456a2 100644 --- a/python/ray/train/tests/test_torch_trainer.py +++ b/python/ray/train/tests/test_torch_trainer.py @@ -1,8 +1,9 @@ import pytest +from ray.air import session +from ray.air.checkpoint import Checkpoint import torch import ray -from ray import train from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -39,7 +40,7 @@ def train_func(config): def test_torch_e2e(ray_start_4_cpus): def train_func(): model = torch.nn.Linear(1, 1) - train.save_checkpoint(model=model) + session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model))) scaling_config = {"num_workers": 2} trainer = TorchTrainer( @@ -65,7 +66,7 @@ def __call__(self, x): def test_torch_e2e_state_dict(ray_start_4_cpus): def train_func(): model = torch.nn.Linear(1, 1).state_dict() - train.save_checkpoint(model=model) + session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model))) scaling_config = {"num_workers": 2} trainer = TorchTrainer( diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 640fa98a19a0..4d1480a2c1c5 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -134,9 +134,11 @@ def train_func(config): def test_tune_checkpoint(ray_start_4_cpus): def train_func(): - for i in range(10): - train.report(test=i) - train.save_checkpoint(hello="world") + for i in range(9): + session.report(dict(test=i)) + session.report( + dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world")) + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index f43a8e259f17..fb9a0e148ae0 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -38,36 +38,35 @@ def train_loop_per_worker(config: Dict): If the ``datasets`` dict contains a training dataset (denoted by the "train" key), then it will be split into multiple dataset - shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` inside + shards that can then be accessed by ``session.get_dataset_shard("train")`` inside ``train_loop_per_worker``. All the other datasets will not be split and - ``ray.train.get_dataset_shard(...)`` will return the the entire Dataset. 
+ ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray AIR session methods ` and :ref:`Ray Train function utils `. .. code-block:: python def train_loop_per_worker(): - # Report intermediate results for callbacks or logging. - train.report(...) - - # Checkpoints the provided args as restorable state. - train.save_checkpoint(...) + # Report intermediate results for callbacks or logging and + # checkpoint data. + session.report(...) # Returns dict of last saved checkpoint. - train.load_checkpoint() + session.get_checkpoint() # Returns the Ray Dataset shard for the given key. - train.get_dataset_shard("my_dataset") + session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. - train.get_world_size() + session.get_world_size() # Returns the rank of this worker. - train.get_world_rank() + session.get_world_rank() # Returns the rank of the worker on the current node. - train.get_local_rank() + session.get_local_rank() You can also use any of the :ref:`Torch specific function utils `. @@ -82,14 +81,14 @@ def train_loop_per_worker(): # Configures the dataloader for distributed training by adding a # `DistributedSampler`. # You should NOT use this if you are doing - # `train.get_dataset_shard(...).to_torch(...)` + # `session.get_dataset_shard(...).to_torch(...)` train.torch.prepare_data_loader(...) # Returns the current torch device. train.torch.get_device() To save a model to use for the ``TorchPredictor``, you must save it under the - "model" kwarg in ``train.save_checkpoint()``. + "model" kwarg in ``Checkpoint`` passed to ``session.report()``. Example: .. code-block:: python @@ -99,6 +98,7 @@ def train_loop_per_worker(): import ray from ray import train + from ray.air import session, Checkpoint from ray.train.torch import TorchTrainer input_size = 1 @@ -117,7 +117,7 @@ def forward(self, input): return self.layer2(self.relu(self.layer1(input))) def train_loop_per_worker(): - dataset_shard = train.get_dataset_shard("train") + dataset_shard = session.get_dataset_shard("train") model = NeuralNetwork() loss_fn = nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=0.1) @@ -133,7 +133,12 @@ def train_loop_per_worker(): optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - train.save_checkpoint(model=model.state_dict()) + session.report( + {}, + checkpoint=Checkpoint.from_dict( + dict(epoch=epoch, model=model.state_dict()) + ), + ) train_dataset = ray.data.from_items([1, 2, 3]) scaling_config = {"num_workers": 3} diff --git a/python/ray/train/torch/train_loop_utils.py b/python/ray/train/torch/train_loop_utils.py index 65b8368cbcf0..dee94bd2ec8c 100644 --- a/python/ray/train/torch/train_loop_utils.py +++ b/python/ray/train/torch/train_loop_utils.py @@ -10,6 +10,7 @@ import ray from ray import train +from ray.air import session from ray.train._internal.accelerator import Accelerator from ray.train.constants import PYTORCH_PROFILER_KEY from torch.optim import Optimizer @@ -282,7 +283,11 @@ def prepare_model( """ ddp_kwargs = ddp_kwargs or {} - rank = train.local_rank() + # Backwards compatibility + try: + rank = session.get_local_rank() + except Exception: + rank = train.local_rank() device = self.get_device() @@ -327,7 +332,13 @@ def model_get_state(self): # See https://stackoverflow.com/questions/972/adding-a-method-to-an-existing-object-instance. 
# noqa: E501 model.__getstate__ = types.MethodType(model_get_state, model) - if wrap_ddp and train.world_size() > 1: + # Backwards compatibility + try: + world_size = session.get_world_size() + except Exception: + world_size = train.world_size() + + if wrap_ddp and world_size > 1: logger.info("Wrapping provided model in DDP.") if torch.cuda.is_available(): model = DistributedDataParallel( @@ -365,13 +376,21 @@ def prepare_data_loader( if ``move_to_device`` is False. """ + # Backwards compatibility + try: + world_size = session.get_world_size() + world_rank = session.get_world_rank() + except Exception: + world_size = train.world_size() + world_rank = train.world_rank() + # Only add Distributed Sampler if the following conditions hold: # 1. More than one training worker is being used. # 2. A DistributedSampler has not already been added by the user. # 3. The dataset is not an IterableDataset. Samplers do not worker with # IterableDatasets. if ( - train.world_size() > 1 + world_size > 1 and not isinstance(data_loader.sampler, DistributedSampler) and not ( hasattr(data_loader, "dataset") @@ -413,7 +432,7 @@ def wrapper(worker_id): using_default_sampler = isinstance( loader.sampler, (SequentialSampler, RandomSampler) ) - if not using_default_sampler and train.world_rank() == 0: + if not using_default_sampler and world_rank == 0: logger.warn( f"The {loader.sampler.__class__.__name__} will be overwritten " "with a DistributedSampler. You can disable this by setting " diff --git a/release/air_tests/horovod/workloads/horovod_tune_test.py b/release/air_tests/horovod/workloads/horovod_tune_test.py index 5008834bea5d..bab14b9bbf0e 100755 --- a/release/air_tests/horovod/workloads/horovod_tune_test.py +++ b/release/air_tests/horovod/workloads/horovod_tune_test.py @@ -2,7 +2,7 @@ import torch.nn as nn import numpy as np import torchvision -from ray.air import RunConfig +from ray.air import RunConfig, session from ray.train.horovod import HorovodTrainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -12,7 +12,7 @@ import ray from ray import tune -from ray import train +from ray.air.checkpoint import Checkpoint from ray.tune.schedulers import create_scheduler from ray.util.ml_utils.resnet import ResNet18 @@ -37,7 +37,7 @@ def train_loop_per_worker(config): ) epoch = 0 - checkpoint = train.load_checkpoint() + checkpoint = session.get_checkpoint() if checkpoint: model_state = checkpoint["model_state"] optimizer_state = checkpoint["optimizer_state"] @@ -79,17 +79,20 @@ def train_loop_per_worker(config): # print statistics running_loss += loss.item() epoch_steps += 1 - train.report(loss=running_loss / epoch_steps) if i % 2000 == 1999: # print every 2000 mini-batches print( "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps) ) - - train.save_checkpoint( - model_state=net.state_dict(), - optimizer_state=optimizer.state_dict(), - epoch=epoch, + session.report( + dict(loss=running_loss / epoch_steps), + checkpoint=Checkpoint.from_dict( + dict( + model_state=net.state_dict(), + optimizer_state=optimizer.state_dict(), + epoch=epoch, + ) + ), ) From 17366cef2a9c621afd60c2fecdcdb97a40da504e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 18:32:47 +0000 Subject: [PATCH 61/70] Update docs --- doc/source/ray-air/doc_code/air_ingest.py | 8 +-- .../ray-air/doc_code/pytorch_starter.py | 9 +-- doc/source/ray-air/doc_code/tf_starter.py | 16 ++--- ...ert_existing_pytorch_code_to_ray_air.ipynb | 38 ++++++---- .../examples/tfx_tabular_train_to_serve.ipynb | 
12 ++-- .../examples/torch_image_example.ipynb | 12 ++-- .../examples/torch_incremental_learning.ipynb | 23 +++--- .../datasets_train/datasets_train.py | 70 +++++++++---------- doc/source/train/faq.rst | 8 +-- doc/source/tune/examples/horovod_simple.ipynb | 4 +- .../tf/tensorflow_autoencoder_example.ipynb | 2 +- 11 files changed, 106 insertions(+), 96 deletions(-) diff --git a/doc/source/ray-air/doc_code/air_ingest.py b/doc/source/ray-air/doc_code/air_ingest.py index 4671406ce306..c14d285de5ad 100644 --- a/doc/source/ray-air/doc_code/air_ingest.py +++ b/doc/source/ray-air/doc_code/air_ingest.py @@ -86,7 +86,7 @@ # __config_4__ import ray -from ray import train +from ray.air import session from ray.data import Dataset from ray.train.torch import TorchTrainer from ray.air.config import DatasetConfig @@ -94,7 +94,7 @@ def train_loop_per_worker(): # By default, bulk loading is used and returns a Dataset object. - data_shard: Dataset = train.get_dataset_shard("train") + data_shard: Dataset = session.get_dataset_shard("train") # Manually iterate over the data 10 times (10 epochs). for _ in range(10): @@ -117,7 +117,7 @@ def train_loop_per_worker(): # __config_5__ import ray -from ray import train +from ray.air import session from ray.data import DatasetPipeline from ray.train.torch import TorchTrainer from ray.air.config import DatasetConfig @@ -125,7 +125,7 @@ def train_loop_per_worker(): def train_loop_per_worker(): # A DatasetPipeline object is returned when `use_stream_api` is set. - data_shard: DatasetPipeline = train.get_dataset_shard("train") + data_shard: DatasetPipeline = session.get_dataset_shard("train") # Use iter_epochs(10) to iterate over 10 epochs of data. for epoch in data_shard.iter_epochs(10): diff --git a/doc/source/ray-air/doc_code/pytorch_starter.py b/doc/source/ray-air/doc_code/pytorch_starter.py index 9c57ac24b4b3..01837655eab8 100644 --- a/doc/source/ray-air/doc_code/pytorch_starter.py +++ b/doc/source/ray-air/doc_code/pytorch_starter.py @@ -29,6 +29,7 @@ from torch import nn from torch.utils.data import DataLoader import ray.train as train +from ray.air import session from ray.train.torch import TorchTrainer # Define model @@ -52,7 +53,7 @@ def forward(self, x): def train_epoch(dataloader, model, loss_fn, optimizer): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction error @@ -70,7 +71,7 @@ def train_epoch(dataloader, model, loss_fn, optimizer): def validate_epoch(dataloader, model, loss_fn): - size = len(dataloader.dataset) // train.world_size() + size = len(dataloader.dataset) // session.get_world_size() num_batches = len(dataloader) model.eval() test_loss, correct = 0, 0 @@ -94,7 +95,7 @@ def train_func(config): lr = config["lr"] epochs = config["epochs"] - worker_batch_size = batch_size // train.world_size() + worker_batch_size = batch_size // session.get_world_size() # Create data loaders. 
train_dataloader = DataLoader(training_data, batch_size=worker_batch_size) @@ -113,7 +114,7 @@ def train_func(config): for _ in range(epochs): train_epoch(train_dataloader, model, loss_fn, optimizer) loss = validate_epoch(test_dataloader, model, loss_fn) - train.report(loss=loss) + session.report(dict(loss=loss)) num_workers = 2 diff --git a/doc/source/ray-air/doc_code/tf_starter.py b/doc/source/ray-air/doc_code/tf_starter.py index 4116ff5dba58..2f20a65a159e 100644 --- a/doc/source/ray-air/doc_code/tf_starter.py +++ b/doc/source/ray-air/doc_code/tf_starter.py @@ -15,9 +15,9 @@ # __air_tf_train_start__ import tensorflow as tf -from tensorflow.keras.callbacks import Callback -import ray.train as train +from ray.air import session +from ray.air.callbacks.keras import Callback from ray.train.tensorflow import prepare_dataset_shard from ray.train.tensorflow import TensorflowTrainer @@ -33,12 +33,6 @@ def build_model() -> tf.keras.Model: return model -class TrainCheckpointReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.save_checkpoint(**{"model": self.model.get_weights()}) - train.report(**logs) - - def train_func(config: dict): batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) @@ -53,7 +47,7 @@ def train_func(config: dict): metrics=[tf.keras.metrics.mean_squared_error], ) - dataset = train.get_dataset_shard("train") + dataset = session.get_dataset_shard("train") results = [] for _ in range(epochs): @@ -67,9 +61,7 @@ def train_func(config: dict): batch_size=batch_size, ) ) - history = multi_worker_model.fit( - tf_dataset, callbacks=[TrainCheckpointReportCallback()] - ) + history = multi_worker_model.fit(tf_dataset, callbacks=[Callback()]) results.append(history.history) return results diff --git a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb index a94a9e4cfc81..715703cd7e82 100644 --- a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb +++ b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb @@ -674,10 +674,11 @@ "\n", "To facilitate this, we only need a few changes to the code:\n", "\n", - "1. We import Ray Train:\n", + "1. We import Ray Train and Ray AIR Session:\n", "\n", "```python\n", "import ray.train as train\n", + "from ray.air import session\n", "```\n", "\n", "\n", @@ -693,7 +694,7 @@ "3. We dynamically adjust the worker batch size according to the number of workers:\n", "\n", "```python\n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", "```\n", "\n", "4. We prepare the data loader for distributed data sharding:\n", @@ -716,13 +717,13 @@ "\n", "```python\n", " test_loss = test(test_dataloader, model, loss_fn)\n", - " train.report(loss=test_loss)\n", + " session.report(dict(loss=test_loss))\n", "```\n", "\n", "7. In the `train_epoch()` and `test_epoch()` functions we divide the `size` by the world size:\n", "\n", "```python\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", "```\n", "\n", "8. In the `train_epoch()` function we can get rid of the device mapping. 
Ray Train does this for us:\n", @@ -745,7 +746,7 @@ "outputs": [], "source": [ "def train_epoch(dataloader, model, loss_fn, optimizer):\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", " model.train()\n", " for batch, (X, y) in enumerate(dataloader):\n", " # We don't need this anymore! Ray Train does this automatically:\n", @@ -781,7 +782,7 @@ "outputs": [], "source": [ "def test_epoch(dataloader, model, loss_fn):\n", - " size = len(dataloader.dataset) // train.world_size() # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", " num_batches = len(dataloader)\n", " model.eval()\n", " test_loss, correct = 0, 0\n", @@ -821,14 +822,14 @@ ], "source": [ "import ray.train as train\n", - "\n", + "from ray.air import session\n", "\n", "def train_func(config: dict):\n", " batch_size = config[\"batch_size\"]\n", " lr = config[\"lr\"]\n", " epochs = config[\"epochs\"]\n", " \n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", " \n", " # Create data loaders.\n", " train_dataloader = DataLoader(training_data, batch_size=batch_size_per_worker)\n", @@ -846,7 +847,7 @@ " for t in range(epochs):\n", " train_epoch(train_dataloader, model, loss_fn, optimizer)\n", " test_loss = test_epoch(test_dataloader, model, loss_fn)\n", - " train.report(loss=test_loss)\n", + " session.report(dict(loss=test_loss))\n", "\n", " print(\"Done!\")" ] @@ -1062,10 +1063,15 @@ "metadata": {}, "source": [ "### Enabling checkpointing to retrieve the model\n", - "Enabling checkpointing is pretty easy - we just need to call the `train.save_checkpoint()` API and pass the model state to it:\n", + "Enabling checkpointing is pretty easy - we just need to pass a `Checkpoint` object with the model state to the `session.report()` API.\n", "\n", "```python\n", - " train.save_checkpoint(epoch=t, model=model.module.state_dict())\n", + " from ray.air import Checkpoint\n", + "\n", + " checkpoint = Checkpoint.from_dict(\n", + " dict(epoch=t, model=model.module.state_dict())\n", + " )\n", + " session.report(dict(loss=test_loss), checkpoint=checkpoint)\n", "```\n", "\n", "Note that the `model.module` part is needed because the model gets wrapped in `torch.nn.DistributedDataParallel` by `train.torch.prepare_model`.\n", @@ -1086,6 +1092,8 @@ "metadata": {}, "outputs": [], "source": [ + "from ray.air import Checkpoint\n", + "\n", "def load_data():\n", " # Download training data from open datasets.\n", " training_data = datasets.FashionMNIST(\n", @@ -1110,7 +1118,7 @@ " lr = config[\"lr\"]\n", " epochs = config[\"epochs\"]\n", " \n", - " batch_size_per_worker = batch_size // train.world_size()\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", " \n", " training_data, test_data = load_data() # <- this is new!\n", " \n", @@ -1130,8 +1138,10 @@ " for t in range(epochs):\n", " train_epoch(train_dataloader, model, loss_fn, optimizer)\n", " test_loss = test_epoch(test_dataloader, model, loss_fn)\n", - " train.save_checkpoint(epoch=t, model=model.module.state_dict()) # <- this is new!\n", - " train.report(loss=test_loss)\n", + " checkpoint = Checkpoint.from_dict(\n", + " dict(epoch=t, model=model.module.state_dict())\n", + " )\n", + " session.report(dict(loss=test_loss), checkpoint=checkpoint)\n", "\n", " print(\"Done!\")" ] diff --git 
a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index f9609fe8d962..514e8fad3a4e 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -619,12 +619,11 @@ }, "outputs": [], "source": [ - "from ray import train\n", + "from ray.air import session, Checkpoint\n", "from ray.train.tensorflow import prepare_dataset_shard\n", - "from ray.tune.integration.keras import TuneReportCallback\n", "\n", "def train_loop_per_worker():\n", - " dataset_shard = train.get_dataset_shard(\"train\")\n", + " dataset_shard = session.get_dataset_shard(\"train\")\n", "\n", " strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n", " with strategy.scope():\n", @@ -653,7 +652,12 @@ "\n", " model.fit(tf_dataset, verbose=0)\n", " # This saves checkpoint in a way that can be used by Ray Serve coherently.\n", - " train.save_checkpoint(epoch=epoch, model=model.get_weights())" + " session.report(\n", + " {},\n", + " checkpoint=Checkpoint.from_dict(\n", + " dict(epoch=epoch, model=model.get_weights())\n", + " ),\n", + " )" ] }, { diff --git a/doc/source/ray-air/examples/torch_image_example.ipynb b/doc/source/ray-air/examples/torch_image_example.ipynb index 98cbfbee814e..10600a6291ed 100644 --- a/doc/source/ray-air/examples/torch_image_example.ipynb +++ b/doc/source/ray-air/examples/torch_image_example.ipynb @@ -253,8 +253,8 @@ "\n", "`train_loop_per_worker` contains regular PyTorch code with a few notable exceptions:\n", "* We wrap our model with {py:func}`train.torch.prepare_model `.\n", - "* We call {py:func}`train.get_dataset_shard ` and {py:meth}`Dataset.to_torch ` to convert a subset of our training data to a Torch dataset.\n", - "* We save model state using {py:func}`train.save_checkpoint `." + "* We call {py:func}`session.get_dataset_shard ` and {py:meth}`Dataset.to_torch ` to convert a subset of our training data to a Torch dataset.\n", + "* We save model state using {py:func}`session.report `." 
] }, { @@ -265,6 +265,7 @@ "outputs": [], "source": [ "from ray import train\n", + "from ray.air import session, Checkpoint\n", "import torch.optim as optim\n", "\n", "\n", @@ -274,7 +275,7 @@ " criterion = nn.CrossEntropyLoss()\n", " optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n", "\n", - " train_dataset_shard: torch.utils.data.Dataset = train.get_dataset_shard(\"train\").to_torch(\n", + " train_dataset_shard: torch.utils.data.Dataset = session.get_dataset_shard(\"train\").to_torch(\n", " feature_columns=[\"image\"],\n", " label_column=\"label\",\n", " batch_size=config[\"batch_size\"],\n", @@ -303,7 +304,10 @@ " print(f\"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}\")\n", " running_loss = 0.0\n", "\n", - " train.save_checkpoint(model=model.module.state_dict())" + " session.report(\n", + " dict(running_loss=running_loss),\n", + " checkpoint=Checkpoint.from_dict(dict(model=model.module.state_dict())),\n", + " )" ] }, { diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 03d7ed476781..dffcd987b438 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -498,11 +498,11 @@ "\n", "The training loop takes in a `config` Dict as an argument that we can use to pass in any configurations for training.\n", "\n", - "This is just standard PyTorch training, with the difference being that we can leverage [Ray Train's utility functions](https://docs.ray.io/en/master/train/api.html#training-function-utilities):\n", + "This is just standard PyTorch training, with the difference being that we can leverage [Ray Train's utility functions](https://docs.ray.io/en/master/train/api.html#training-function-utilities) and [Ray AIR Sesssion](https://docs.ray.io/en/master/ray-air/package-ref.html#module-ray.air.session):\n", "- `ray.train.torch.prepare_model(...)`: This will prepare the model for distributed training by wrapping it in PyTorch `DistributedDataParallel` and moving it to the correct accelerator device.\n", - "- `ray.train.get_dataset_shard(...)`: This will get the Ray Dataset shard for this particular Data Parallel worker.\n", - "- `ray.train.save_checkpoint(...)`: This will tell Ray Train to save the provided arguments as a checkpoint. Checkpoints will be written to disk under the `~/ray_results` directory.\n", - "- `ray.train.load_checkpoint()`: Returns a checkpoint to resume from. This is useful for either fault tolerance purposes, or for our purposes, to continue training the same model on a new incoming dataset." + "- `ray.air.session.get_dataset_shard(...)`: This will get the Ray Dataset shard for this particular Data Parallel worker.\n", + "- `ray.air.session.report({}, checkpoint=...)`: This will tell Ray Train to persist the provided `Checkpoint` object.\n", + "- `ray.air.session.get_checkpoint()`: Returns a checkpoint to resume from. This is useful for either fault tolerance purposes, or for our purposes, to continue training the same model on a new incoming dataset." 
] }, { @@ -514,6 +514,7 @@ "outputs": [], "source": [ "from ray import train\n", + "from ray.air import session, Checkpoint\n", "\n", "from torch.optim import SGD\n", "from torch.nn import CrossEntropyLoss\n", @@ -529,9 +530,9 @@ " model = SimpleMLP(num_classes=10)\n", "\n", " # Load model from checkpoint if there is a checkpoint to load from.\n", - " checkpoint_to_load = train.load_checkpoint()\n", + " checkpoint_to_load = session.get_checkpoint()\n", " if checkpoint_to_load:\n", - " state_dict_to_resume_from = checkpoint_to_load[\"model\"]\n", + " state_dict_to_resume_from = checkpoint_to_load.to_dict()[\"model\"]\n", " model.load_state_dict(state_dict=state_dict_to_resume_from)\n", "\n", " model = train.torch.prepare_model(model)\n", @@ -540,7 +541,7 @@ " criterion = CrossEntropyLoss()\n", "\n", " # Get the Ray Dataset shard for this data parallel worker, and convert it to a PyTorch Dataset.\n", - " dataset_shard = train.get_dataset_shard(\"train\").to_torch(\n", + " dataset_shard = session.get_dataset_shard(\"train\").to_torch(\n", " label_column=\"label\",\n", " batch_size=batch_size,\n", " unsqueeze_feature_tensors=False,\n", @@ -548,6 +549,7 @@ " )\n", "\n", " for epoch_idx in range(num_epochs):\n", + " running_loss = 0\n", " for iteration, (train_mb_x, train_mb_y) in enumerate(dataset_shard):\n", " optimizer.zero_grad()\n", " train_mb_x = train_mb_x.to(train.torch.get_device())\n", @@ -562,13 +564,15 @@ " # Update\n", " optimizer.step()\n", "\n", - " if train.world_rank() == 0 and iteration % 500 == 0:\n", + " running_loss += loss.item()\n", + " if session.get_world_rank() == 0 and iteration % 500 == 0:\n", " print(f\"loss: {loss.item():>7f}, epoch: {epoch_idx}, iteration: {iteration}\")\n", "\n", " # Checkpoint model after every epoch.\n", " state_dict = model.state_dict()\n", " consume_prefix_in_state_dict_if_present(state_dict, \"module.\")\n", - " train.save_checkpoint(model=state_dict)" + " checkpoint = Checkpoint.from_dict(dict(model=state_dict))\n", + " session.report({\"loss\": running_loss}, checkpoint=checkpoint)" ] }, { @@ -1237,7 +1241,6 @@ "source": [ "from ray.train.torch import TorchTrainer\n", "from ray.train.torch import TorchPredictor\n", - "from ray.air import Checkpoint\n", "from ray import serve\n", "from ray.serve.model_wrappers import ModelWrapperDeployment\n", "from ray.serve.http_adapters import json_to_ndarray\n", diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index dcc9635251ba..a915c3029f27 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -18,6 +18,7 @@ import boto3 import mlflow import pandas as pd +from ray.train.torch.torch_trainer import TorchTrainer import torch import torch.nn as nn import torch.optim as optim @@ -25,10 +26,9 @@ import ray from ray import train +from ray.air import session, Checkpoint, RunConfig from ray.data.aggregate import Mean, Std -from ray.train import Trainer -from ray.train.callbacks import TBXLoggerCallback -from ray.train.callbacks.logging import MLflowLoggerCallback +from ray.air.callbacks.mlflow import MLflowLoggerCallback def make_and_upload_dataset(dir_path): @@ -404,14 +404,16 @@ def train_func(config): # Setup device. 
device = torch.device( - f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu" + f"cuda:{session.get_local_rank()}" + if use_gpu and torch.cuda.is_available() + else "cpu" ) print(f"Device: {device}") # Setup data. - train_dataset_pipeline = train.get_dataset_shard("train_dataset") + train_dataset_pipeline = session.get_dataset_shard("train") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() - test_dataset = train.get_dataset_shard("test_dataset") + test_dataset = session.get_dataset_shard("test") test_torch_dataset = test_dataset.to_torch( label_column="label", batch_size=batch_size ) @@ -456,20 +458,20 @@ def train_func(config): f"{test_num_correct} / {test_num_total} = {test_acc:.4f}" ) - # Record and log stats. - train.report( - train_acc=train_acc, - train_loss=train_running_loss, - test_acc=test_acc, - test_loss=test_running_loss, - ) - # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net - train.save_checkpoint(model_state_dict=module.state_dict()) + checkpoint = Checkpoint.from_dict(dict(model=module.cpu())) - if train.world_rank() == 0: - return module.cpu() + # Record and log stats. + session.report( + dict( + train_acc=train_acc, + train_loss=train_running_loss, + test_acc=test_acc, + test_loss=test_running_loss, + ), + checkpoint=checkpoint, + ) if __name__ == "__main__": @@ -598,7 +600,7 @@ def train_func(config): train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() del train_dataset - datasets = {"train_dataset": train_dataset_pipeline, "test_dataset": test_dataset} + datasets = {"train": train_dataset_pipeline, "test": test_dataset} config = { "use_gpu": use_gpu, @@ -611,15 +613,8 @@ def train_func(config): "num_features": num_features, } - # Create 2 callbacks: one for TensorBoard Logging and one for MLflow - # logging. Pass these into Trainer, and all results that are - # reported by ``train.report()`` will be logged to these 2 places. - # TODO: TBXLoggerCallback should create nonexistent logdir - # and should also create 1 directory per file. - tbx_logdir = "./runs" - os.makedirs(tbx_logdir, exist_ok=True) + # Create the MLflowLoggerCallback callbacks = [ - TBXLoggerCallback(logdir=tbx_logdir), MLflowLoggerCallback( experiment_name="cuj-big-data-training", save_artifact=True ), @@ -628,18 +623,19 @@ def train_func(config): # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None - trainer = Trainer( - backend="torch", - num_workers=num_workers, - use_gpu=use_gpu, - resources_per_worker=resources_per_worker, - ) - trainer.start() - results = trainer.run( - train_func=train_func, config=config, callbacks=callbacks, dataset=datasets + trainer = TorchTrainer( + train_func, + train_loop_config=config, + datasets=datasets, + scaling_config=dict( + num_workers=num_workers, + use_gpu=use_gpu, + resources_per_worker=resources_per_worker, + ), + run_config=RunConfig(callbacks=callbacks), ) - model = results[0] - trainer.shutdown() + results = trainer.fit() + model = results.checkpoint.to_dict()["model"] if args.mlflow_register_model: mlflow.pytorch.log_model( diff --git a/doc/source/train/faq.rst b/doc/source/train/faq.rst index 3230b4ff6041..f28fbecaf9d1 100644 --- a/doc/source/train/faq.rst +++ b/doc/source/train/faq.rst @@ -9,7 +9,7 @@ How fast is Ray Train compared to PyTorch, TensorFlow, etc.? 
At its core, training speed should be the same - while Ray Train launches distributed training workers via Ray Actors, communication during training (e.g. gradient synchronization) is handled by the backend training framework itself. -For example, when running Ray Train with the ``"torch"`` backend, +For example, when running Ray Train with the ``TorchTrainer``, distributed training communication is done with Torch's ``DistributedDataParallel``. How do I set resources? @@ -18,7 +18,7 @@ How do I set resources? By default, each worker will reserve 1 CPU resource, and an additional 1 GPU resource if ``use_gpu=True``. To override these resource requests or request additional custom resources, -you can initialize the ``Trainer`` with ``resources_per_worker``. +you can initialize the ``Trainer`` with ``resources_per_worker`` specified in ``scaling_config``. .. note:: Some GPU utility functions (e.g. :ref:`train-api-torch-get-device`, :ref:`train-api-torch-prepare-model`) @@ -36,5 +36,5 @@ If you try to create a Matplotlib plot in the training function, you may encount To handle this, consider the following approaches: -1. If there is no dependency on any code in your training function, simply move the Matplotlib logic out and execute it before or after ``trainer.run``. -2. If you are plotting metrics, you can pass the metrics via ``train.report()`` and create a :ref:`custom callback ` to plot the results. +1. If there is no dependency on any code in your training function, simply move the Matplotlib logic out and execute it before or after ``trainer.fit()``. +2. If you are plotting metrics, you can pass the metrics via ``session.report()`` and create a :ref:`custom callback ` to plot the results. diff --git a/doc/source/tune/examples/horovod_simple.ipynb b/doc/source/tune/examples/horovod_simple.ipynb index bcf716f99dc3..5fe71d15f809 100644 --- a/doc/source/tune/examples/horovod_simple.ipynb +++ b/doc/source/tune/examples/horovod_simple.ipynb @@ -40,8 +40,8 @@ "import torch\n", "\n", "import ray\n", - "from ray import train\n", "from ray import tune\n", + "from ray.air import session\n", "from ray.train.horovod import HorovodTrainer\n", "from ray.tune.tune_config import TuneConfig\n", "from ray.tune.tuner import Tuner\n", @@ -119,7 +119,7 @@ "\n", " optimizer.step()\n", " time.sleep(0.1)\n", - " train.report(loss=loss.item())\n", + " session.report(dict(loss=loss.item()))\n", " total = time.time() - start\n", " print(f\"Took {total:0.3f} s. 
Avg: {total / num_steps:0.3f} s.\")\n", "\n", diff --git a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb index 0d8e36efcebc..5862d75e0d43 100644 --- a/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb +++ b/python/ray/air/examples/tf/tensorflow_autoencoder_example.ipynb @@ -2034,7 +2034,7 @@ } ], "source": [ - "from ray.air.train.integrations.tensorflow import TensorflowTrainer\n", + "from ray.train.tensorflow import TensorflowTrainer\n", "from ray.air.result import Result\n", "\n", "def train_tensorflow_mnist(\n", From cc7d0663167cbf2e77c5fb76402f64442fdcc9fc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 19:25:35 +0000 Subject: [PATCH 62/70] Fix horovod test --- .../horovod/workloads/horovod_tune_test.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/release/air_tests/horovod/workloads/horovod_tune_test.py b/release/air_tests/horovod/workloads/horovod_tune_test.py index bab14b9bbf0e..e437d4e99abe 100755 --- a/release/air_tests/horovod/workloads/horovod_tune_test.py +++ b/release/air_tests/horovod/workloads/horovod_tune_test.py @@ -58,6 +58,7 @@ def train_loop_per_worker(config): trainloader = DataLoader( trainset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=4 ) + trainloader_len = len(trainloader) for epoch in range(epoch, 40): # loop over the dataset multiple times running_loss = 0.0 @@ -79,21 +80,22 @@ def train_loop_per_worker(config): # print statistics running_loss += loss.item() epoch_steps += 1 + if i == trainloader_len - 1: + checkpoint = Checkpoint.from_dict( + dict( + model_state=net.state_dict(), + optimizer_state=optimizer.state_dict(), + epoch=epoch, + ) + ) + else: + checkpoint = None + session.report(dict(loss=running_loss / epoch_steps), checkpoint=checkpoint) if i % 2000 == 1999: # print every 2000 mini-batches print( "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps) ) - session.report( - dict(loss=running_loss / epoch_steps), - checkpoint=Checkpoint.from_dict( - dict( - model_state=net.state_dict(), - optimizer_state=optimizer.state_dict(), - epoch=epoch, - ) - ), - ) if __name__ == "__main__": From 6cf961ea14f297165097412ce88e179ca9a537a0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 20:56:29 +0000 Subject: [PATCH 63/70] Fix CI --- .../_examples/datasets_train/datasets_train.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index a915c3029f27..69560c14c31c 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -18,6 +18,7 @@ import boto3 import mlflow import pandas as pd +from ray.air.config import DatasetConfig from ray.train.torch.torch_trainer import TorchTrainer import torch import torch.nn as nn @@ -596,11 +597,7 @@ def train_func(config): DROPOUT_EVERY = 5 DROPOUT_PROB = 0.2 - # Random global shuffle - train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() - del train_dataset - - datasets = {"train": train_dataset_pipeline, "test": test_dataset} + datasets = {"train": train_dataset, "test": test_dataset} config = { "use_gpu": use_gpu, @@ -633,6 +630,9 @@ def train_func(config): resources_per_worker=resources_per_worker, ), run_config=RunConfig(callbacks=callbacks), + 
dataset_config={ + "train": DatasetConfig(use_stream_api=True, global_shuffle=True) + }, ) results = trainer.fit() model = results.checkpoint.to_dict()["model"] From 330c36be89b7a9b294d52caf69380146333b0128 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 22:05:24 +0000 Subject: [PATCH 64/70] Fix CI --- .../ray-core/_examples/datasets_train/datasets_train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index 69560c14c31c..8f539d37ae04 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -631,7 +631,9 @@ def train_func(config): ), run_config=RunConfig(callbacks=callbacks), dataset_config={ - "train": DatasetConfig(use_stream_api=True, global_shuffle=True) + "train": DatasetConfig( + use_stream_api=True, stream_window_size=-1, global_shuffle=True + ) }, ) results = trainer.fit() From 30c9ab896cda4d1cc7e8ba94c03b93cf015e024d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 5 Jul 2022 22:59:58 +0000 Subject: [PATCH 65/70] Fix CI --- .../datasets_train/datasets_train.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index 8f539d37ae04..350fe50d61a2 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -416,7 +416,7 @@ def train_func(config): train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = session.get_dataset_shard("test") test_torch_dataset = test_dataset.to_torch( - label_column="label", batch_size=batch_size + label_column="label", batch_size=batch_size, drop_last=True ) net = Net( @@ -461,9 +461,10 @@ def train_func(config): # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net - checkpoint = Checkpoint.from_dict(dict(model=module.cpu())) + checkpoint = Checkpoint.from_dict(dict(model=module.state_dict())) # Record and log stats. 
+ print(f"session report on {session.get_world_rank()}") session.report( dict( train_acc=train_acc, @@ -637,9 +638,27 @@ def train_func(config): }, ) results = trainer.fit() - model = results.checkpoint.to_dict()["model"] + state_dict = results.checkpoint.to_dict()["model"] + + def load_model_func(): + num_layers = config["num_layers"] + num_hidden = config["num_hidden"] + dropout_every = config["dropout_every"] + dropout_prob = config["dropout_prob"] + num_features = config["num_features"] + + model = Net( + n_layers=num_layers, + n_features=num_features, + num_hidden=num_hidden, + dropout_every=dropout_every, + drop_prob=dropout_prob, + ) + model.load_state_dict(state_dict) + return model if args.mlflow_register_model: + model = load_model_func() mlflow.pytorch.log_model( model, artifact_path="models", registered_model_name="torch_model" ) @@ -658,26 +677,6 @@ def load_model_func(): model_uri = f"models:/torch_model/{latest_version}" return mlflow.pytorch.load_model(model_uri) - else: - state_dict = model.state_dict() - - def load_model_func(): - num_layers = config["num_layers"] - num_hidden = config["num_hidden"] - dropout_every = config["dropout_every"] - dropout_prob = config["dropout_prob"] - num_features = config["num_features"] - - model = Net( - n_layers=num_layers, - n_features=num_features, - num_hidden=num_hidden, - dropout_every=dropout_every, - drop_prob=dropout_prob, - ) - model.load_state_dict(state_dict) - return model - class BatchInferModel: def __init__(self, load_model_func): self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") From d0affbcfa2f452bfd42d7ae4852431a533908437 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 16:06:48 +0000 Subject: [PATCH 66/70] Fix tests --- .../examples/horovod/horovod_pytorch_example.py | 5 +++++ python/ray/train/huggingface/_huggingface_utils.py | 2 +- python/ray/train/tests/test_huggingface_trainer.py | 14 +++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/ray/air/examples/horovod/horovod_pytorch_example.py b/python/ray/air/examples/horovod/horovod_pytorch_example.py index 946cddc4fd59..62bcfaa4e92d 100644 --- a/python/ray/air/examples/horovod/horovod_pytorch_example.py +++ b/python/ray/air/examples/horovod/horovod_pytorch_example.py @@ -142,6 +142,7 @@ def train_func(config): model, optimizer, train_loader, train_sampler = setup(config) + results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda @@ -151,8 +152,12 @@ def train_func(config): else: checkpoint_dict = dict(model=model) checkpoint_dict = Checkpoint.from_dict(checkpoint_dict) + results.append(loss) session.report(dict(loss=loss), checkpoint=checkpoint_dict) + # Only used for testing. 
+ return results + def main(num_workers, use_gpu, kwargs): trainer = HorovodTrainer( diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/huggingface/_huggingface_utils.py index d7b50f810d99..7fc4237d16cd 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/huggingface/_huggingface_utils.py @@ -149,7 +149,7 @@ def on_save(self, args, state, control, **kwargs): ) def _report(self): - if self.delayed_report: + if self.delayed_report["metrics"]: session.report(**self.delayed_report) self.delayed_report = {"metrics": {}, "checkpoint": None} diff --git a/python/ray/train/tests/test_huggingface_trainer.py b/python/ray/train/tests/test_huggingface_trainer.py index bde24f1bf7a1..a1b30cdeb48d 100644 --- a/python/ray/train/tests/test_huggingface_trainer.py +++ b/python/ray/train/tests/test_huggingface_trainer.py @@ -107,7 +107,7 @@ def test_reporting(): def _fake_report(**kwargs): reports.append(kwargs) - with patch("ray.train.report", _fake_report): + with patch("ray.air.session.report", _fake_report): state = TrainerState() report_callback = TrainReportCallback() report_callback.on_epoch_begin(None, state, None) @@ -125,12 +125,12 @@ def _fake_report(**kwargs): report_callback.on_train_end(None, state, None) assert len(reports) == 2 - assert "log1" in reports[0] - assert "log2" in reports[0] - assert reports[0]["epoch"] == 1 - assert "log1" in reports[1] - assert "log2" in reports[1] - assert reports[1]["epoch"] == 2 + assert "log1" in reports[0]["metrics"] + assert "log2" in reports[0]["metrics"] + assert reports[0]["metrics"]["epoch"] == 1 + assert "log1" in reports[1]["metrics"] + assert "log2" in reports[1]["metrics"] + assert reports[1]["metrics"]["epoch"] == 2 if __name__ == "__main__": From 587ad5634779021acc6ac63a6df49ae67bbf47d3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:24:59 +0000 Subject: [PATCH 67/70] Add todo --- python/ray/train/examples/torch_fashion_mnist_example.py | 2 ++ python/ray/train/examples/torch_linear_example.py | 1 + python/ray/train/examples/tune_cifar_torch_pbt_example.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 6e8db3220db4..5d716cb2dd91 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -114,6 +114,8 @@ def train_func(config: Dict): train.report(loss=loss) loss_results.append(loss) + # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return loss_results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index ceabd0c2853f..892cbb486244 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -81,6 +81,7 @@ def train_func(config): train.report(**result) results.append(result) # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results diff --git a/python/ray/train/examples/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py index f0b5c786ff8d..bddc01e1cd95 100644 --- a/python/ray/train/examples/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -117,6 +117,8 @@ def train_func(config): train.report(**result) results.append(result) + # 
return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results From 139f44d495083261ac0cb3ff33c80f18d59a5ff0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:33:30 +0000 Subject: [PATCH 68/70] Use `trial_logdir` instead --- python/ray/air/result.py | 3 ++- python/ray/train/examples/mlflow_simple_example.py | 2 +- python/ray/tune/result_grid.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 833aa7660f33..a6cf8fe47353 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from ray.air.checkpoint import Checkpoint @@ -38,7 +39,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - log_dir: Optional[str] + log_dir: Optional[Path] metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d64d0525ae58..b3a7264b20d8 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -41,7 +41,7 @@ def train_func(): # Print the latest run directory and keep note of it. # For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 -print("Run directory:", result.log_dir) +print("Run directory:", result.log_dir.parent) # TensorBoard is saved in parent dir # How to visualize the logs diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 77994e2f491b..bdf39b97f4ef 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from typing import Optional, Union import pandas as pd @@ -180,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.local_dir, + log_dir=Path(trial.logdir), metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 3a4d3f347d3f1170269ce77ad4adbd7ab057e32d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 21:49:00 +0000 Subject: [PATCH 69/70] Fix --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index bdf39b97f4ef..bec6a1438bee 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -181,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=Path(trial.logdir), + log_dir=Path(trial.logdir) if trial.logdir else None, metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 2ea93d78a4bffc4d2b3e4e000dafdae3dc18390f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 7 Jul 2022 17:51:47 +0000 Subject: [PATCH 70/70] Only print metrics --- python/ray/train/examples/horovod/horovod_example.py | 2 +- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- python/ray/train/examples/tensorflow_linear_dataset_example.py | 2 +- python/ray/train/examples/tensorflow_mnist_example.py | 2 +- .../auto_pipeline_for_host_to_device_data_transfer.py | 2 +- 
python/ray/train/examples/torch_fashion_mnist_example.py | 2 +- python/ray/train/examples/torch_linear_dataset_example.py | 2 +- python/ray/train/examples/torch_linear_example.py | 2 +- python/ray/train/examples/transformers/transformers_example.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c01788008ec5..8e930f7d151f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -158,7 +158,7 @@ def main(num_workers, use_gpu, kwargs): scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, ) results = trainer.fit() - print(results) + print(results.metrics) # Horovod Class API. diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 2d223c43ec1d..99f7b73a525a 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -17,7 +17,7 @@ def main(num_workers=2, use_gpu=False): ) final_results = trainer.fit() - print("Full results for rank 0 worker: ", final_results) + print("Final metrics: ", final_results.metrics) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 0ee9d48d2077..f3a938e06c0e 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -83,7 +83,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results}") + print(f"Results: {results.metrics}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 97e8db033025..14f4cf6dc7ef 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -81,7 +81,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index 03e69ca67f96..1220f541d034 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -109,7 +109,7 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 5d716cb2dd91..7ad5017bbc5c 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -126,7 +126,7 @@ def train_fashion_mnist(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": 
use_gpu}, ) result = trainer.fit() - print(f"Results: {result}") + print(f"Results: {result.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py index acfa0ce2e637..15fbf0da97b9 100644 --- a/python/ray/train/examples/torch_linear_dataset_example.py +++ b/python/ray/train/examples/torch_linear_dataset_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index 892cbb486244..8be2e1d2dcc6 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -94,7 +94,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index 1b47e5ce2e31..30f9f0158f06 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -619,7 +619,7 @@ def main(): scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) else: # Run training locally. train_func(config)
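
For reference, a minimal end-to-end sketch of the new Ray AIR training pattern that the examples in this series converge on (TorchTrainer + session.report + Checkpoint). This sketch is illustrative only and is not part of any diff above; the toy model, data, and hyperparameter values are assumptions, while the API calls mirror those already used in the patches.

    # Illustrative sketch only -- not part of the patch series above.
    # It combines the APIs the diffs migrate to (TorchTrainer, session.report,
    # Checkpoint); the toy model, data, and hyperparameters are assumptions.
    import torch
    import torch.nn as nn

    from ray.air import session, Checkpoint
    from ray.train.torch import TorchTrainer


    def train_loop_per_worker(config):
        model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
        loss_fn = nn.MSELoss()
        # Toy data; the real examples shard a Ray Dataset per worker via
        # session.get_dataset_shard() instead.
        x = torch.randn(32, 1)
        y = 2 * x

        for epoch in range(config["num_epochs"]):
            loss = loss_fn(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Metrics and checkpoints both flow through session.report(),
            # replacing the old train.report() / train.save_checkpoint() calls.
            session.report(
                dict(loss=loss.item(), epoch=epoch),
                checkpoint=Checkpoint.from_dict(dict(model=model.state_dict())),
            )


    trainer = TorchTrainer(
        train_loop_per_worker,
        train_loop_config={"lr": 1e-2, "num_epochs": 3},
        scaling_config={"num_workers": 2, "use_gpu": False},
    )
    result = trainer.fit()
    print(result.metrics)
    state_dict = result.checkpoint.to_dict()["model"]

The same structure applies to the HorovodTrainer and TensorflowTrainer examples touched above: the training function reports metrics and checkpoints through session.report(), and the final metrics and checkpoint are read off the Result object returned by trainer.fit().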