From b39a86490665efa7a307c21841edd6611f4bdfa8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 13 Jun 2022 21:10:19 +0000 Subject: [PATCH 01/63] Use new Train API for examples --- doc/source/train/examples.rst | 2 +- .../examples/tune_linear_dataset_example.rst | 6 -- .../tune_torch_linear_dataset_example.rst | 6 ++ python/ray/air/result.py | 6 +- python/ray/train/BUILD | 10 --- .../train/examples/horovod/horovod_example.py | 17 +++-- .../examples/mlflow_fashion_mnist_example.py | 21 +++--- .../tensorflow_linear_dataset_example.py | 18 +++-- .../examples/tensorflow_mnist_example.py | 12 ++-- .../train/examples/tensorflow_quick_start.py | 17 +++-- ...peline_for_host_to_device_data_transfer.py | 13 ++-- .../ray/train/examples/torch_quick_start.py | 19 +++--- .../examples/train_fashion_mnist_example.py | 20 +++--- .../examples/train_linear_dataset_example.py | 15 ++-- .../train/examples/train_linear_example.py | 14 ++-- .../transformers/transformers_example.py | 17 +++-- .../tune_cifar_pytorch_pbt_example.py | 56 ++++++++------- .../examples/tune_linear_dataset_example.py | 68 ------------------- .../ray/train/examples/tune_linear_example.py | 34 ++++++---- .../examples/tune_tensorflow_mnist_example.py | 36 +++++----- python/ray/tune/result_grid.py | 6 +- 21 files changed, 178 insertions(+), 235 deletions(-) delete mode 100644 doc/source/train/examples/tune_linear_dataset_example.rst create mode 100644 doc/source/train/examples/tune_torch_linear_dataset_example.rst delete mode 100644 python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 6affd7457a1c..e644f708b639 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/train/examples/tune_linear_dataset_example`: +* :doc:`/air/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. diff --git a/doc/source/train/examples/tune_linear_dataset_example.rst b/doc/source/train/examples/tune_linear_dataset_example.rst deleted file mode 100644 index d25af796465c..000000000000 --- a/doc/source/train/examples/tune_linear_dataset_example.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -tune_linear_dataset_example -=========================== - -.. literalinclude:: /../../python/ray/train/examples/tune_linear_dataset_example.py diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst new file mode 100644 index 000000000000..22ad2e562660 --- /dev/null +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -0,0 +1,6 @@ +:orphan: + +tune_torch_linear_dataset_example +================================= + +.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..97472c64d395 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,5 +1,5 @@ -from typing import Any, Dict, Optional from dataclasses import dataclass +from typing import Any, Dict, Optional from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI @@ -13,7 +13,7 @@ class Result: This is the class produced by Trainer.fit(). It contains a checkpoint, which can be used for resuming training and for creating a Predictor object. 
It also contains a metrics object describing - training metrics. `error` is included so that non successful runs + training metrics. ``error`` is included so that non successful runs and trials can be represented as well. The constructor is a private API. @@ -22,11 +22,13 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. + log_dir: Directory where the trial logs are saved. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] + log_dir: Optional[str] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index dc2d6357b6cd..d33bad4d9ac2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -82,16 +82,6 @@ py_test( args = ["--smoke-test"] ) -py_test( - name = "tune_linear_dataset_example", - size = "medium", - main = "examples/tune_linear_dataset_example.py", - srcs = ["examples/tune_linear_dataset_example.py"], - tags = ["team:ml", "exclusive", "gpu_only", "tune"], - deps = [":train_lib"], - args = ["--smoke-test", "--use-gpu"] -) - py_test( name = "tune_linear_example", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index cb578b1fb18f..c3202307755f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -2,15 +2,16 @@ import os import horovod.torch as hvd -import ray import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed from filelock import FileLock -from ray.train import Trainer from torchvision import datasets, transforms +import ray +from ray.train.horovod import HorovodTrainer + def metric_average(val, name): tensor = torch.tensor(val) @@ -152,11 +153,13 @@ def train_func(config): def main(num_workers, use_gpu, kwargs): - trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers) - trainer.start() - loss_per_epoch = trainer.run(train_func, config=kwargs) - trainer.shutdown() - print(loss_per_epoch) + trainer = HorovodTrainer( + train_func, + train_loop_config=kwargs, + scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, + ) + results = trainer.fit() + print(results) # Horovod Class API. 
diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 05f915523543..7cd54b821859 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,20 +1,23 @@ import argparse -from ray.train import Trainer +from ray.air import RunConfig from ray.train.examples.train_fashion_mnist_example import train_func -from ray.train.callbacks.logging import MLflowLoggerCallback +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback def main(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - final_results = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, + run_config=RunConfig( + callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")] + ), ) + final_results = trainer.fit() - print("Full losses for rank 0 worker: ", final_results) + print("Full results for rank 0 worker: ", final_results) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index c1360195b36c..9271c5125da4 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -7,8 +7,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.tensorflow import prepare_dataset_shard +from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard class TrainReportCallback(Callback): @@ -55,7 +54,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. 
multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard() + dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() results = [] @@ -78,14 +77,13 @@ def train_func(config): def train_tensorflow_linear(num_workers=2, use_gpu=False): dataset_pipeline = get_dataset_pipeline() - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, - dataset=dataset_pipeline, - config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, + datasets={"train": dataset_pipeline}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 3e89969cc58e..980e58652d95 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -10,7 +10,7 @@ from tensorflow.keras.callbacks import Callback import ray.train as train -from ray.train import Trainer +from ray.train.tensorflow import TensorflowTrainer class TrainReportCallback(Callback): @@ -81,12 +81,12 @@ def train_func(config): def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): - trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - results = trainer.run( - train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(f"Results: {results[0]}") diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0907853135b9..0ac3666672e2 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -3,6 +3,9 @@ # __tf_setup_begin__ +import json +import os + import numpy as np import tensorflow as tf @@ -47,8 +50,6 @@ def train_func(): # __tf_distributed_begin__ -import json -import os def train_func_distributed(): per_worker_batch_size = 64 @@ -78,15 +79,13 @@ def train_func_distributed(): # __tf_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="tensorflow", num_workers=4) + from ray.train.tensorflow import TensorflowTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="tensorflow", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TensorflowTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + trainer.fit() # __tf_trainer_end__ diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index c8cc25b044a0..03e69ca67f96 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -5,8 +5,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer +from ray.train.torch import TorchTrainer class Net(nn.Module): @@ -94,7 +95,6 @@ def train_func(config): def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=True) config = { "lr": 1e-2, "hidden_size": num_hidden_layers, @@ -102,9 +102,12 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo "epochs": epochs, "use_auto_transfer": use_auto_transfer, } - trainer.start() - results = trainer.run(train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"use_gpu": True, "num_workers": num_workers}, + ) + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index e152c8604610..eaf07a95a5d1 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -4,6 +4,10 @@ # __torch_setup_begin__ import torch import torch.nn as nn +import torch.optim as optim + +import ray.train.torch +from ray import train num_samples = 20 input_size = 10 @@ -28,7 +32,6 @@ def forward(self, input): # __torch_single_begin__ -import torch.optim as optim def train_func(): num_epochs = 3 @@ -48,8 +51,6 @@ def train_func(): # __torch_distributed_begin__ -from ray import train -import ray.train.torch def train_func_distributed(): num_epochs = 3 @@ -78,15 +79,13 @@ def train_func_distributed(): # __torch_trainer_begin__ - from ray.train import Trainer - - trainer = Trainer(backend="torch", num_workers=4) + from ray.train.torch import TorchTrainer # For GPU Training, set `use_gpu` to True. 
- # trainer = Trainer(backend="torch", num_workers=4, use_gpu=True) + use_gpu = False + + trainer = TorchTrainer(train_func_distributed, scaling_config={"num_workers":4, "use_gpu":use_gpu}) - trainer.start() - results = trainer.run(train_func_distributed) - trainer.shutdown() + results = trainer.fit() # __torch_trainer_end__ diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/train_fashion_mnist_example.py index 5c172dc5a949..6e8db3220db4 100644 --- a/python/ray/train/examples/train_fashion_mnist_example.py +++ b/python/ray/train/examples/train_fashion_mnist_example.py @@ -2,14 +2,14 @@ from typing import Dict import torch -import ray.train as train -from ray.train.trainer import Trainer -from ray.train.callbacks import JsonLoggerCallback from torch import nn from torch.utils.data import DataLoader from torchvision import datasets from torchvision.transforms import ToTensor +import ray.train as train +from ray.train.torch import TorchTrainer + # Download training data from open datasets. training_data = datasets.FashionMNIST( root="~/data", @@ -118,15 +118,13 @@ def train_func(config: Dict): def train_fashion_mnist(num_workers=2, use_gpu=False): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) - trainer.start() - result = trainer.run( - train_func=train_func, - config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, - callbacks=[JsonLoggerCallback()], + trainer = TorchTrainer( + train_func, + train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4}, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() - print(f"Loss results: {result}") + result = trainer.fit() + print(f"Results: {result}") if __name__ == "__main__": diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 2ce30c9b81b8..1cfbff434c9f 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -8,8 +8,7 @@ import ray.train as train from ray.data import Dataset from ray.data.dataset_pipeline import DatasetPipeline -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: @@ -120,16 +119,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): datasets = get_datasets() - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} - trainer.start() - results = trainer.run( + trainer = TorchTrainer( train_func, - config, - dataset=datasets, - callbacks=[JsonLoggerCallback(), TBXLoggerCallback()], + train_loop_config=config, + datasets=datasets, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 8a784190d3cc..40d850754401 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -3,9 +3,9 @@ import numpy as np import torch import torch.nn as nn + import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback +from ray.train.torch import TorchTrainer class 
LinearDataset(torch.utils.data.Dataset): @@ -86,13 +86,13 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False, epochs=3): - trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run( - train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()] + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) - trainer.shutdown() + results = trainer.fit() print(results) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index ce269733d4de..b6cb461c4b73 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -20,14 +20,12 @@ import math import os import random -from typing import Dict, Any +from typing import Any, Dict import datasets -import ray import transformers from accelerate import Accelerator from datasets import load_dataset, load_metric -from ray.train import Trainer from torch.utils.data.dataloader import DataLoader from tqdm.auto import tqdm from transformers import ( @@ -44,6 +42,9 @@ ) from transformers.utils.versions import require_version +import ray +from ray.train.torch import TorchTrainer + logger = logging.getLogger(__name__) require_version( @@ -612,9 +613,13 @@ def main(): else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - trainer.start() - trainer.run(train_func, config) + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) + results = trainer.fit() + print(results) else: # Run training locally. 
train_func(config) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index c600684e2479..5e4711adae84 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -11,9 +11,11 @@ import ray import ray.train as train from ray import tune -from ray.train import Trainer -from ray.tune import CLIReporter +from ray.air.config import FailureConfig, RunConfig +from ray.train.torch import TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.util.ml_utils.resnet import ResNet18 @@ -149,8 +151,10 @@ def train_func(config): else: ray.init(address=args.address) - trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu) - Trainable = trainer.to_tune_trainable(train_func) + trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, + ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="loss", @@ -158,32 +162,32 @@ def train_func(config): perturbation_interval=1, hyperparam_mutations={ # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), + "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config/momentum": [0.8, 0.9, 0.99], }, ) - reporter = CLIReporter() - reporter.add_metric_column("loss", "loss") - - analysis = tune.run( - Trainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "batch_size": 128 * args.num_workers, - "epochs": args.num_epochs, - "test_mode": args.smoke_test, # whether to to subset the data + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "batch_size": 128 * args.num_workers, + "epochs": args.num_epochs, + "test_mode": args.smoke_test, # whether to to subset the data + } }, - stop={"training_iteration": 2 if args.smoke_test else 100}, - max_failures=3, # used for fault tolerance - checkpoint_freq=3, # used for fault tolerance - keep_checkpoints_num=1, # used for fault tolerance - verbose=2, - progress_reporter=reporter, - scheduler=pbt_scheduler, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 2 if args.smoke_test else 100}, + failure=FailureConfig(max_failures=3), # used for fault tolerance + ), ) - print(analysis.get_best_config(metric="loss", mode="min")) + results = tuner.fit() + + print(results.get_best_result(metric="loss", mode="min")) diff --git a/python/ray/train/examples/tune_linear_dataset_example.py b/python/ray/train/examples/tune_linear_dataset_example.py deleted file mode 100644 index adc04f9ba3e3..000000000000 --- a/python/ray/train/examples/tune_linear_dataset_example.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse - -import ray -from ray import tune -from ray.train import Trainer - -from train_linear_dataset_example import train_func, get_datasets - - -def tune_linear(num_workers, num_samples, use_gpu): - datasets = get_datasets() - - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - Trainable = trainer.to_tune_trainable(train_func, dataset=datasets) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, - }, - ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for testing.", - ) - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--num-samples", - type=int, - default=2, - help="Sets number of samples for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Use GPU for training." - ) - - args = parser.parse_args() - - if args.smoke_test: - # 1 for driver, 1 for datasets - num_cpus = args.num_workers + 2 - num_gpus = args.num_workers if args.use_gpu else 0 - ray.init(num_cpus=args.num_workers + 2, num_gpus=num_gpus) - else: - ray.init(address=args.address) - tune_linear( - num_workers=args.num_workers, use_gpu=args.use_gpu, num_samples=args.num_samples - ) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py index a0641906c202..5d4a8edc911b 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,27 +1,31 @@ import argparse +from train_linear_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from train_linear_example import train_func +from ray.train.torch import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_linear(num_workers, num_samples): - trainer = Trainer("torch", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([4, 16, 32]), - "epochs": 3, + trainer = TorchTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([4, 16, 32]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - results = analysis.get_best_config(metric="loss", mode="min") - print(results) - return results + analysis = tuner.fit() + result = analysis.get_best_result(metric="loss", mode="min") + print(result) + return result if __name__ == "__main__": diff --git a/python/ray/train/examples/tune_tensorflow_mnist_example.py b/python/ray/train/examples/tune_tensorflow_mnist_example.py index 8ab6776c3b64..4fc408b2d6eb 100644 --- a/python/ray/train/examples/tune_tensorflow_mnist_example.py +++ b/python/ray/train/examples/tune_tensorflow_mnist_example.py @@ -1,28 +1,32 @@ import argparse +from tensorflow_mnist_example import train_func + import ray from ray import tune -from ray.train import Trainer - -from tensorflow_mnist_example import train_func +from ray.train.tensorflow import TensorflowTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner def tune_tensorflow_mnist(num_workers, num_samples): - trainer = Trainer(backend="tensorflow", num_workers=num_workers) - Trainable = trainer.to_tune_trainable(train_func) - analysis = tune.run( - Trainable, - 
num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": 3, + trainer = TensorflowTrainer(train_func, scaling_config={"num_workers": num_workers}) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": 3, + }, }, + tune_config=TuneConfig(num_samples=num_samples), ) - best_loss = analysis.get_best_config(metric="loss", mode="min") - best_accuracy = analysis.get_best_config(metric="accuracy", mode="max") - print(f"Best loss config: {best_loss}") - print(f"Best accuracy config: {best_accuracy}") + analysis = tuner.fit() + best_loss = analysis.get_best_result(metric="loss", mode="min") + best_accuracy = analysis.get_best_result(metric="accuracy", mode="max") + print(f"Best loss result: {best_loss}") + print(f"Best accuracy result: {best_accuracy}") return analysis diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 66440074a62f..f770e3f05f35 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -2,10 +2,11 @@ from typing import Optional, Union import pandas as pd -from ray.cloudpickle import cloudpickle -from ray.exceptions import RayTaskError + from ray.air.checkpoint import Checkpoint from ray.air.result import Result +from ray.cloudpickle import cloudpickle +from ray.exceptions import RayTaskError from ray.tune import ExperimentAnalysis from ray.tune.error import TuneError from ray.tune.trial import Trial @@ -177,5 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + log_dir=trial.logdir, ) return result From b31399ef71afd24f0ac84df9bcbb8a761be893b6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:15 +0000 Subject: [PATCH 02/63] Fix FailureConfig not being a dataclass --- python/ray/air/config.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 40b63603f51c..a8f5c2b85c66 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -1,14 +1,5 @@ from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional, Union from ray.air.constants import WILDCARD_KEY from ray.tune.syncer import SyncConfig @@ -267,6 +258,7 @@ def _merge(self, other: "DatasetConfig") -> "DatasetConfig": return new_config +@dataclass @PublicAPI(stability="alpha") class FailureConfig: """Configuration related to failure handling of each run/trial. 
From 5cc9229716f8526f10632fddf0ef282308a47da4 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 16:07:22 +0000 Subject: [PATCH 03/63] Fix errors --- .../examples/mlflow_fashion_mnist_example.py | 2 +- .../train/examples/mlflow_simple_example.py | 36 ++++++++++--------- .../examples/tensorflow_mnist_example.py | 2 +- .../train/examples/train_linear_example.py | 2 +- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 7cd54b821859..1cda7fc3e1ac 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -47,7 +47,7 @@ def main(num_workers=2, use_gpu=False): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) args.num_workers = 2 args.use_gpu = False else: diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index 548b44d96f3c..d61a435ce3e3 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -1,6 +1,8 @@ from ray import train -from ray.train import Trainer -from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback +from ray.air import RunConfig +from ray.train.torch import TorchTrainer +from ray.tune.integration.mlflow import MLflowLoggerCallback +from ray.tune.logger import TBXLoggerCallback def train_func(): @@ -8,29 +10,31 @@ def train_func(): train.report(epoch=i) -trainer = Trainer(backend="torch", num_workers=2) -trainer.start() +trainer = TorchTrainer( + train_func, + scaling_config={"num_workers": 2}, + run_config=RunConfig( + callbacks=[ + MLflowLoggerCallback(experiment_name="train_experiment"), + TBXLoggerCallback(), + ], + ), +) # Run the training function, logging all the intermediate results # to MLflow and Tensorboard. -result = trainer.run( - train_func, - callbacks=[ - MLflowLoggerCallback(experiment_name="train_experiment"), - TBXLoggerCallback(), - ], -) +result = trainer.fit() # Print the latest run directory and keep note of it. -# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001 -print("Run directory:", trainer.latest_run_dir) - -trainer.shutdown() +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 +print("Run directory:", result.logdir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ray_results/train_2021-09-01_12-00-00/run_001` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ +# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` # $ cd # # # View the MLflow UI. 
diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 980e58652d95..a0ef319f8756 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -120,7 +120,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_tensorflow_mnist() else: ray.init(address=args.address) diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 40d850754401..069c6dd13db1 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): import ray if args.smoke_test: - ray.init(num_cpus=2) + ray.init(num_cpus=4) train_linear() else: ray.init(address=args.address) From 523021843ab6886fbdce1f40381be0ed733ced98 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:01:17 +0000 Subject: [PATCH 04/63] Fix --- doc/source/train/examples/tune_torch_linear_dataset_example.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst index 22ad2e562660..df74e93ebdf2 100644 --- a/doc/source/train/examples/tune_torch_linear_dataset_example.rst +++ b/doc/source/train/examples/tune_torch_linear_dataset_example.rst @@ -3,4 +3,4 @@ tune_torch_linear_dataset_example ================================= -.. literalinclude:: /../../python/ray/air/examples/tune_torch_linear_dataset_example.py +.. literalinclude:: /../../python/ray/air/examples/pytorch/tune_torch_linear_dataset_example.py From ef4a3fcda417ca38dae7fc8caf37b289481dd064 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 17:46:39 +0000 Subject: [PATCH 05/63] Fix link --- doc/source/train/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index e644f708b639..2a4e0b75bbd1 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -62,7 +62,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/train_linear_dataset_example`: Simple example for training a linear PyTorch model. -* :doc:`/air/examples/tune_torch_linear_dataset_example`: +* :doc:`/train/examples/tune_torch_linear_dataset_example`: Simple example for tuning a linear PyTorch model. From f5cfe6262dfeb173663ab7693ff1dfd60b385208 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 19:47:22 +0000 Subject: [PATCH 06/63] Fix simple example --- .../train/examples/mlflow_simple_example.py | 25 +++++++++++++------ python/ray/tune/result_grid.py | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d61a435ce3e3..d64d0525ae58 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -25,20 +25,29 @@ def train_func(): # to MLflow and Tensorboard. result = trainer.fit() +# For MLFLow logs: + +# MLFlow logs will by default be saved in an `mlflow` directory +# in the current working directory. + +# $ cd mlflow +# # View the MLflow UI. 
+# $ mlflow ui + +# You can change the directory by setting the `tracking_uri` argument +# in `MLflowLoggerCallback`. + +# For TensorBoard logs: + # Print the latest run directory and keep note of it. -# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06 -print("Run directory:", result.logdir) +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 +print("Run directory:", result.log_dir) # How to visualize the logs # Navigate to the run directory of the trainer. -# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06\ -# /TorchTrainer_c02c7_00000_0_2022-06-13_20-31-06` +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06` # $ cd # -# # View the MLflow UI. -# $ mlflow ui -# # # View the tensorboard UI. # $ tensorboard --logdir . diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index f770e3f05f35..38ebf357e5e4 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -178,6 +178,6 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.logdir, + log_dir=trial.local_dir, ) return result From 468f7e80f48049f041a8c5fb038cc2d49a280b14 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:09:43 +0000 Subject: [PATCH 07/63] train loop utils --- python/ray/train/train_loop_utils.py | 80 ++++++++++++++++------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index b4dde5f4ca7b..774fda94a324 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -1,11 +1,8 @@ -from typing import TYPE_CHECKING -from typing import Optional, Dict, Union import warnings +from typing import TYPE_CHECKING, Dict, Optional, Union +from ray.train._internal.session import get_session from ray.train.constants import SESSION_MISUSE_LOG_ONCE_KEY -from ray.train._internal.session import ( - get_session, -) from ray.util import PublicAPI, log_once if TYPE_CHECKING: @@ -41,23 +38,25 @@ def get_dataset_shard( import ray from ray import train + from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard().to_torch() + data_shard = train.get_dataset_shard("train").to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() - trainer = Trainer(backend="torch") - trainer.start() - # Trainer will automatically handle sharding. - train_model = trainer.run(train_func, dataset=dataset) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + datasets={"train": dataset}, + scaling_config={"num_workers": 2}, + ) + trainer.fit() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -98,16 +97,15 @@ def report(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() Args: **kwargs: Any key value pair to be reported by Train. 
@@ -129,6 +127,7 @@ def world_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -136,10 +135,8 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + trainer.fit() """ session = get_session() @@ -156,16 +153,18 @@ def local_rank() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = Trainer(backend="torch", use_gpu=True) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer( + train_func, + scaling_config={"use_gpu": True, "num_workers": 2}, + ) + trainer.fit() """ session = get_session() @@ -181,18 +180,29 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train + from ray.air import Checkpoint + from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func, checkpoint={"epoch": 3}) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + "epoch": 3, + } + ) + trainer = TorchTrainer( + train_func, + resume_from_checkpoint=checkpoint, + scaling_config={"num_workers": 2}, + ) + trainer.fit() # 3 # 4 - trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -216,16 +226,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = Trainer(backend="torch") - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) + result = trainer.fit() + assert result.checkpoint Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -245,14 +255,14 @@ def world_size() -> int: import time from ray import train + from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = Trainer(backend="torch", num_workers=4) - trainer.start() - trainer.run(train_func) - trainer.shutdown() + trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) + result = trainer.fit() + """ session = get_session() if session is None: From 4ef6302cc5d4bfd3c00caaeb2e9ad6c678946858 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:29:42 +0000 Subject: [PATCH 08/63] Remove tensorboard example --- python/ray/train/BUILD | 9 -- .../torch_tensorboard_profiler_example.py | 84 ------------------- 2 files changed, 93 deletions(-) delete mode 100644 python/ray/train/examples/torch_tensorboard_profiler_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index d33bad4d9ac2..bf73935be49c 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -39,15 +39,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "torch_tensorboard_profiler_example", - size = "small", - main = "examples/torch_tensorboard_profiler_example.py", - srcs = ["examples/torch_tensorboard_profiler_example.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "transformers_example_gpu", size = "large", diff --git a/python/ray/train/examples/torch_tensorboard_profiler_example.py b/python/ray/train/examples/torch_tensorboard_profiler_example.py deleted file mode 100644 index 5f3641c31c8d..000000000000 --- a/python/ray/train/examples/torch_tensorboard_profiler_example.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -import torch -from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present -from torch.profiler import profile, record_function, schedule - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.callbacks import TBXLoggerCallback -from ray.train.callbacks.profile import TorchTensorboardProfilerCallback -from ray.train.torch import TorchWorkerProfiler - - -def train_func(): - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - # Setup model. - model = torch.nn.Linear(1, 1) - model = train.torch.prepare_model(model) - loss_fn = torch.nn.MSELoss() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-2) - - # Setup data. - input = torch.randn(1000, 1) - labels = input * 2 - dataset = torch.utils.data.TensorDataset(input, labels) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=32) - dataloader = train.torch.prepare_data_loader(dataloader) - - # Train. 
- for epoch in range(5): - with record_function("train_epoch"): - for X, y in dataloader: - pred = model(X) - loss = loss_fn(pred, y) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - with record_function("train_checkpoint"): - state_dict = model.state_dict() - consume_prefix_in_state_dict_if_present(state_dict, "module.") - train.save_checkpoint(epoch=epoch, model_weights=state_dict) - - p.step() - - with record_function("train_report"): - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--address", required=False, type=str, help="the address to use for Ray" - ) - parser.add_argument( - "--num-workers", - "-n", - type=int, - default=2, - help="Sets number of workers for training.", - ) - parser.add_argument( - "--use-gpu", action="store_true", default=False, help="Enables GPU training" - ) - - args = parser.parse_args() - - ray.init(address=args.address) - - callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()] - trainer = Trainer( - backend="torch", num_workers=args.num_workers, use_gpu=args.use_gpu - ) - trainer.start() - trainer.run(train_func, callbacks=callbacks) - trainer.shutdown() From 5db3c14e400cc94da0448dd26b3cf5328b82ea4d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:30:09 +0000 Subject: [PATCH 09/63] PBT test update --- .../tune_cifar_pytorch_pbt_example.py | 13 ++-- .../workloads/pytorch_pbt_failure.py | 77 ++++++++++--------- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index 5e4711adae84..a7031b3116a1 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -58,6 +58,7 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): + # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) @@ -157,14 +158,14 @@ def train_func(config): ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "train_loop_config/lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "train_loop_config/momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index 903e2a1cc553..d354b2834ac6 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -4,16 +4,16 @@ import numpy as np import ray - from ray import tune +from ray.air.config import RunConfig +from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner from ray.tune.utils.mock import FailureInjectorCallback from 
ray.tune.utils.release_test_util import ProgressCallback -from ray.train import Trainer -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func -from ray.train.torch import TorchConfig - parser = argparse.ArgumentParser() parser.add_argument( "--smoke-test", @@ -26,46 +26,53 @@ ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) num_training_workers = 1 if args.smoke_test else 3 -trainer = Trainer( - num_workers=num_training_workers, - use_gpu=not args.smoke_test, - backend=TorchConfig(backend="gloo"), +trainer = TorchTrainer( + train_func, + scaling_config=dict( + num_workers=num_training_workers, + use_gpu=not args.smoke_test, + ), + torch_config=TorchConfig(backend="gloo"), ) -TorchTrainable = trainer.to_tune_trainable(train_func=train_func) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", - metric="loss", - mode="min", perturbation_interval=1, hyperparam_mutations={ - # distribution for resampling - "lr": lambda: np.random.uniform(0.001, 1), - # allow perturbations within this set of categorical values - "momentum": [0.8, 0.9, 0.99], + "train_loop_config": { + # distribution for resampling + "lr": lambda: np.random.uniform(0.001, 1), + # allow perturbations within this set of categorical values + "momentum": [0.8, 0.9, 0.99], + } }, ) -analysis = tune.run( - TorchTrainable, - num_samples=4, - config={ - "lr": tune.choice([0.001, 0.01, 0.1]), - "momentum": 0.8, - "head_location": None, - "worker_locations": None, - "test_mode": args.smoke_test, - "batch_size": 128 * num_training_workers, - # For the long running test, we want the training to run forever, and it will - # be terminated by the release test infra. - "epochs": 1 if args.smoke_test else sys.maxsize, +tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.choice([0.001, 0.01, 0.1]), + "momentum": 0.8, + "head_location": None, + "worker_locations": None, + "test_mode": args.smoke_test, + "batch_size": 128 * num_training_workers, + # For the long running test, we want the training to run forever, + # and it will be terminated by the release test infra. 
+ "epochs": 1 if args.smoke_test else sys.maxsize, + } }, - max_failures=-1, # used for fault tolerance - checkpoint_freq=2, # used for fault tolerance - scheduler=pbt_scheduler, - callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], - stop={"training_iteration": 1} if args.smoke_test else None, + tune_config=TuneConfig( + num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler + ), + run_config=RunConfig( + stop={"training_iteration": 1} if args.smoke_test else None, + callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], + ), ) -print(analysis.get_best_config(metric="loss", mode="min")) +results = tuner.fit() + +print(results.get_best_result(metric="loss", mode="min")) From cb805f297c6f14e0878cdca9fbc3774d4070191e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 14 Jun 2022 20:48:31 +0000 Subject: [PATCH 10/63] WIP --- .../transformers/transformers_example.py | 2 +- python/ray/train/tests/test_examples.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index b6cb461c4b73..1b47e5ce2e31 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -609,7 +609,7 @@ def main(): if args.start_local or args.address or args.num_workers > 1 or args.use_gpu: if args.start_local: # Start a local Ray runtime. - ray.init(num_cpus=args.num_workers) + ray.init(num_cpus=args.num_workers + 2) else: # Connect to a Ray cluster for distributed training. ray.init(address=args.address) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index c72249b95ad5..10ebefa03588 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -19,7 +19,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -35,14 +37,11 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers) + trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() + results = trainer.fit() - assert len(results) == num_workers - result = results[0] + result = results.metrics loss = result["loss"] assert len(loss) == epochs @@ -56,17 +55,15 @@ def test_tensorflow_mnist(ray_start_2_cpus, num_workers): def test_tf_non_distributed(ray_start_2_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(tf_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer.fit() def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = Trainer("tensorflow", num_workers=2) + trainer = 
TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) From 2f69e37e50b57e282c406f198096848a6e03c5d1 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:08:55 +0000 Subject: [PATCH 11/63] Do not use pipeline --- .../ray/train/examples/train_linear_dataset_example.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 1cfbff434c9f..f84faa5f7a11 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -7,11 +7,10 @@ import ray import ray.train as train from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, DatasetPipeline]: +def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -23,12 +22,9 @@ def get_dataset(a, b, size) -> Dataset: [split] ) - train_dataset_pipeline = train_dataset.repeat().random_shuffle_each_window() - validation_dataset_pipeline = validation_dataset.repeat() - datasets = { - "train": train_dataset_pipeline, - "validation": validation_dataset_pipeline, + "train": train_dataset, + "validation": validation_dataset, } return datasets From 0d8eeb4a879f161e7215f117c75ef4ded4358099 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 18:14:21 +0000 Subject: [PATCH 12/63] Remove callback test --- python/ray/train/BUILD | 8 - python/ray/train/tests/test_callbacks.py | 357 ----------------------- 2 files changed, 365 deletions(-) delete mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index bf73935be49c..1db84dd79d37 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -113,14 +113,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "test_callbacks", - size = "medium", - srcs = ["tests/test_callbacks.py"], - tags = ["team:ml", "exclusive"], - deps = [":train_lib"] -) - py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py deleted file mode 100644 index 9aeb54088801..000000000000 --- a/python/ray/train/tests/test_callbacks.py +++ /dev/null @@ -1,357 +0,0 @@ -from typing import Dict, List -import glob -import io -import json -from collections import defaultdict -from contextlib import redirect_stdout -from pathlib import Path - -import pytest - -import ray -import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import ( - TrainingCallback, - JsonLoggerCallback, - PrintCallback, - TBXLoggerCallback, - TorchTensorboardProfilerCallback, -) -from ray.train.callbacks.logging import ( - MLflowLoggerCallback, - _TrainCallbackLogdirManager, -) -from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, - BASIC_AUTOFILLED_KEYS, - ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from 
ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, -) - -try: - from tensorflow.python.summary.summary_iterator import summary_iterator -except ImportError: - summary_iterator = None - - -@pytest.fixture -def ray_start_4_cpus(): - address_info = ray.init(num_cpus=4) - yield address_info - # The code after the yield will run as teardown code. - ray.shutdown() - - -class TestConfig(BackendConfig): - @property - def backend_cls(self): - return TestBackend - - -class TestBackend(Backend): - def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): - pass - - -def test_print(ray_start_4_cpus): - num_workers = 4 - - def train_func(): - train.report(rank=train.world_rank()) - - stream = io.StringIO() - with redirect_stdout(stream): - trainer = Trainer(TestConfig(), num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[PrintCallback()]) - trainer.shutdown() - - output = stream.getvalue() - results = json.loads(output) - - assert len(results) == num_workers - for i, result in enumerate(results): - assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) - assert result["rank"] == i - - -@pytest.mark.parametrize("input", [None, "dir", "file"]) -def test_train_callback_logdir_manager(tmp_path, input): - default_dir = tmp_path / "default_dir" - - if input == "dir": - input_logdir = tmp_path / "dir" - input_logdir.mkdir(parents=True) - elif input == "file": - input_logdir = tmp_path / "file" - input_logdir.touch() - else: - input_logdir = None - - logdir_manager = _TrainCallbackLogdirManager(input_logdir) - - if input_logdir: - path = logdir_manager.logdir_path - assert path == logdir_manager.logdir_path - else: - with pytest.raises(RuntimeError): - path = logdir_manager.logdir_path - - if input_logdir and not Path(input_logdir).is_dir(): - with pytest.raises(FileExistsError): - logdir_manager.setup_logdir(str(default_dir)) - else: - path = logdir_manager.setup_logdir(str(default_dir)) - assert path == logdir_manager.logdir_path - - -@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) -@pytest.mark.parametrize("detailed", [False, True]) -@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) -def test_json( - monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename -): - if detailed: - monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") - - config = TestConfig() - - num_iters = 5 - num_workers = 4 - - if workers_to_log is None: - num_workers_to_log = num_workers - elif isinstance(workers_to_log, int): - num_workers_to_log = 1 - else: - num_workers_to_log = len(workers_to_log) - - def train_func(): - for i in range(num_iters): - train.report(index=i) - return 1 - - if filename is None: - # if None, use default value - callback = JsonLoggerCallback(workers_to_log=workers_to_log) - else: - callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) - trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - if filename is None: - assert str(callback.log_path.name) == JsonLoggerCallback._default_filename - else: - assert str(callback.log_path.name) == filename - - with open(callback.log_path, "r") as f: - log = json.load(f) - print(log) - assert len(log) == num_iters - assert len(log[0]) == num_workers_to_log - assert all(len(element) == len(log[0]) for element in log) 
- assert all( - all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) - for element in log - ) - assert all( - all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) - for element in log - ) - if detailed: - assert all( - all( - all(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - else: - assert all( - all( - not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) - for worker in element - ) - for element in log - ) - - -def _validate_tbx_result(events_dir): - events_file = list(glob.glob(f"{events_dir}/events*"))[0] - results = defaultdict(list) - for event in summary_iterator(events_file): - for v in event.summary.value: - assert v.tag.startswith("ray/train") - results[v.tag[10:]].append(v.simple_value) - - assert len(results["episode_reward_mean"]) == 3 - assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] - assert len(results["score"]) == 1 - assert len(results["hello/world"]) == 1 - - -def test_TBX(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - - def train_func(): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) - return 1 - - callback = TBXLoggerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - _validate_tbx_result(temp_dir) - - -def test_mlflow(ray_start_4_cpus, tmp_path): - config = TestConfig() - - params = {"p1": "p1"} - - temp_dir = tmp_path - num_workers = 4 - - def train_func(config): - train.report(episode_reward_mean=4) - train.report(episode_reward_mean=5) - train.report(episode_reward_mean=6) - return 1 - - callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, config=params, callbacks=[callback]) - - from mlflow.tracking import MlflowClient - - client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) - - experiment_id = client.get_experiment_by_name("test_exp").experiment_id - all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) - assert len(all_runs) == 1 - # all_runs is a pandas dataframe. 
- all_runs = all_runs.to_dict(orient="records") - run_id = all_runs[0]["run_id"] - run = client.get_run(run_id) - - assert run.data.params == params - assert ( - "episode_reward_mean" in run.data.metrics - and run.data.metrics["episode_reward_mean"] == 6.0 - ) - assert ( - TRAINING_ITERATION in run.data.metrics - and run.data.metrics[TRAINING_ITERATION] == 3.0 - ) - - metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") - - assert len(metric_history) == 3 - iterations = [metric.step for metric in metric_history] - assert iterations == [1, 2, 3] - rewards = [metric.value for metric in metric_history] - assert rewards == [4, 5, 6] - - -def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): - config = TestConfig() - - temp_dir = tmp_path - num_workers = 4 - num_epochs = 2 - - def train_func(): - from ray.train.torch import TorchWorkerProfiler - from torch.profiler import profile, record_function, schedule - - twp = TorchWorkerProfiler() - with profile( - activities=[], - schedule=schedule(wait=0, warmup=0, active=1), - on_trace_ready=twp.trace_handler, - ) as p: - - for epoch in range(num_epochs): - with record_function("test_function"): - pass - - p.step() - - profile_results = twp.get_and_clear_profile_traces() - train.report(epoch=epoch, **profile_results) - - callback = TorchTensorboardProfilerCallback(temp_dir) - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - trainer.run(train_func, callbacks=[callback]) - - assert temp_dir.exists() - - count = 0 - for path in temp_dir.iterdir(): - assert path.is_file() - count += 1 - assert count == num_workers * num_epochs - - -# fix issue: repeat assignments for preprocessor results nested recursive calling -# see https://github.com/ray-project/ray/issues/25005 -def test_hotfix_callback_nested_recusive_calling(): - # test callback used to simulate the nested recursive calling for preprocess() - class TestCallback(TrainingCallback): - def __init__(self): - self.max_process_time = 0 - - def count_process_times(self, processor): - count = 0 - if processor: - if isinstance(processor, SequentialResultsPreprocessor): - for preprocessor in processor.preprocessors: - # recursive calling preprocessors in list - count += self.count_process_times(preprocessor) - else: - count = 1 - return count - - def handle_result(self, results: List[Dict], **info): - process_times = self.count_process_times(self.results_preprocessor) - if process_times > self.max_process_time: - self.max_process_time = process_times - print(f"process times: {process_times}") - - def train_func(): - for idx in range(num_iterates): - train.report(iterate=idx + 1) - - # python default limitation for iterate depth - num_iterates = 1000 - trainer = Trainer(TestConfig(), num_workers=1) - trainer.start() - test_callback = TestCallback() - trainer.run(train_func, callbacks=[test_callback]) - assert test_callback.max_process_time == 1 - print(f"callback max process time: {test_callback.max_process_time}") - trainer.shutdown() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", "-x", __file__])) From 4a3103ec4a3fcd06761a9ddf51895407bbef3c87 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 19:10:27 +0000 Subject: [PATCH 13/63] Examples tests --- .../train/examples/horovod/horovod_example.py | 3 +- python/ray/train/tests/test_examples.py | 119 +++++++++--------- 2 files changed, 62 insertions(+), 60 deletions(-) diff --git 
a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c3202307755f..1e163da70052 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -10,6 +10,7 @@ from torchvision import datasets, transforms import ray +from ray import train from ray.train.horovod import HorovodTrainer @@ -148,7 +149,7 @@ def train_func(config): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) - results.append(loss) + train.report(loss=loss) return results diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 10ebefa03588..e37cf43a4687 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -2,9 +2,10 @@ import ray from ray.train import Trainer +from ray.train.constants import TRAINING_ITERATION +from ray.train.examples.horovod.horovod_example import HorovodTrainClass from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, - HorovodTrainClass, ) from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, @@ -19,52 +20,56 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. 
ray.shutdown() @pytest.mark.parametrize("num_workers", [1, 2]) -def test_tensorflow_mnist(ray_start_2_cpus, num_workers): +def test_tensorflow_mnist(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) results = trainer.fit() result = results.metrics - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] - - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs -def test_tf_non_distributed(ray_start_2_cpus): +def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" - trainer = TorchTrainer(tf_quick_start_train_func, scaling_config=dict(num_workers=1)) + trainer = TorchTrainer( + tf_quick_start_train_func, scaling_config=dict(num_workers=1) + ) trainer.fit() -def test_tensorflow_mnist_fail(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 - trainer = TensorflowTrainer(tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=num_workers)) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) + ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -85,24 +90,24 @@ def test_tensorflow_mnist_fail(ray_start_2_cpus): @pytest.mark.parametrize("num_workers", [1, 2]) -def test_torch_linear(ray_start_2_cpus, num_workers): +def test_torch_linear(ray_start_4_cpus, num_workers): num_workers = num_workers epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} - trainer.start() - results = trainer.run(linear_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers + trainer = TorchTrainer( + linear_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_linear_failure(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -113,56 +118,51 @@ def test_torch_linear_failure(ray_start_2_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1]["loss"] < result[0]["loss"] + assert result[TRAINING_ITERATION] == epochs -def test_torch_fashion_mnist(ray_start_2_cpus): +def test_torch_fashion_mnist(ray_start_4_cpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == 
num_workers + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + result = results.metrics + assert result[TRAINING_ITERATION] == epochs -def test_torch_non_distributed(ray_start_2_cpus): +def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" - trainer = Trainer(backend="torch", num_workers=1) - trainer.start() - trainer.run(torch_quick_start_train_func) - trainer.shutdown() + trainer = TorchTrainer( + torch_quick_start_train_func, scaling_config=dict(num_workers=1) + ) + trainer.fit() -def test_horovod_torch_mnist(ray_start_2_cpus): +def test_horovod_torch_mnist(ray_start_4_cpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers -def test_horovod_torch_mnist_stateful(ray_start_2_cpus): +@pytest.mark.skip("Refactor as a backend test.") +def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 trainer = Trainer("horovod", num_workers) @@ -180,7 +180,8 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From f7f3ea8559f3b2237895108d03cdc985646f9c3f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 20:14:22 +0000 Subject: [PATCH 14/63] Move tests --- python/ray/train/tests/test_examples.py | 12 ++- python/ray/train/tests/test_gpu.py | 85 +++++++++--------- python/ray/train/tests/test_minimal.py | 59 +++++------- python/ray/train/tests/test_tune.py | 114 ++++++++++++++---------- 4 files changed, 138 insertions(+), 132 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index e37cf43a4687..06c88577205e 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -61,15 +61,13 @@ def test_tf_non_distributed(ray_start_4_cpus): trainer.fit() -@pytest.mark.skip("Refactor as a backend test.") -def test_tensorflow_mnist_fail(ray_start_4_cpus): +# TODO: Refactor as a backend test. +def test_tensorflow_mnist_fail(ray_start_2_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 + trainer = Trainer("tensorflow", num_workers=2) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer = TensorflowTrainer( - tensorflow_mnist_train_func, config, scaling_config=dict(num_workers=2) - ) trainer.start() kill_callback = KillCallback(fail_on=0, trainer=trainer) results = trainer.run( @@ -106,7 +104,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_torch_linear_failure(ray_start_4_cpus): num_workers = 2 epochs = 3 @@ -161,7 +159,7 @@ def test_horovod_torch_mnist(ray_start_4_cpus): assert result[TRAINING_ITERATION] == num_workers -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): num_workers = 2 num_epochs = 2 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 875ad766ebda..16dac0c42fc7 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -1,15 +1,17 @@ import os -import pytest from timeit import default_timer as timer +import pytest import torch +import torchvision +from test_tune import torch_fashion_mnist, tune_tensorflow_mnist from torch.nn.parallel import DistributedDataParallel from torch.utils.data import DataLoader, DistributedSampler -import torchvision import ray import ray.train as train from ray.train import Trainer, TrainingCallback +from ray.train.constants import TRAINING_ITERATION from ray.train.examples.horovod.horovod_example import ( train_func as horovod_torch_train_func, ) @@ -20,7 +22,9 @@ train_func as fashion_mnist_train_func, ) from ray.train.examples.train_linear_example import LinearDataset -from test_tune import torch_fashion_mnist, tune_tensorflow_mnist +from ray.train.horovod.horovod_trainer import HorovodTrainer +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer @pytest.fixture @@ -38,6 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -64,6 +69,7 @@ def train_fn(): ) +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -85,6 +91,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -108,6 +115,7 @@ def train_fn(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -154,6 +162,7 @@ def train_func(): assert result1 == result2 +@pytest.mark.skip("Refactor as a backend test.") def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -196,6 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) +@pytest.mark.skip("Refactor as a backend test.") def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -213,6 +223,7 @@ def train_func(): trainer.shutdown() +@pytest.mark.skip("Refactor as a backend test.") def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -287,55 +298,47 @@ def test_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = 
trainer.run(tensorflow_mnist_train_func, config) - trainer.shutdown() - - assert len(results) == num_workers - result = results[0] + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - loss = result["loss"] - assert len(loss) == epochs - assert loss[-1] < loss[0] + result = results.metrics - accuracy = result["accuracy"] - assert len(accuracy) == epochs - assert accuracy[-1] > accuracy[0] + assert result[TRAINING_ITERATION] == epochs def test_torch_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 epochs = 3 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=True) config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} - trainer.start() - results = trainer.run(fashion_mnist_train_func, config) - trainer.shutdown() + trainer = TorchTrainer( + fashion_mnist_train_func, + train_loop_config=config, + scaling_config=dict(num_workers=num_workers, use_gpu=True), + ) + results = trainer.fit() - assert len(results) == num_workers + result = results.metrics - for result in results: - assert len(result) == epochs - assert result[-1] < result[0] + assert result[TRAINING_ITERATION] == epochs def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus): num_workers = 2 num_epochs = 2 - trainer = Trainer("horovod", num_workers, use_gpu=True) - trainer.start() - results = trainer.run( - horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3} + trainer = HorovodTrainer( + horovod_torch_train_func, + train_loop_config={"num_epochs": num_epochs, "lr": 1e-3}, + scaling_config=dict(num_workers=num_workers, use_gpu=True), ) - trainer.shutdown() - - assert len(results) == num_workers - for worker_result in results: - assert len(worker_result) == num_epochs - assert worker_result[num_epochs - 1] < worker_result[0] + results = trainer.fit() + result = results.metrics + assert result[TRAINING_ITERATION] == num_workers def test_tune_fashion_mnist_gpu(ray_start_4_cpus_2_gpus): @@ -349,9 +352,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): from ray.train.examples.train_linear_dataset_example import train_linear - results = train_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_linear(num_workers=2, use_gpu=True) def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): @@ -359,11 +360,10 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): train_tensorflow_linear, ) - results = train_tensorflow_linear(num_workers=2, use_gpu=True) - for result in results: - assert result[-1]["loss"] < result[0]["loss"] + assert train_tensorflow_linear(num_workers=2, use_gpu=True) +@pytest.mark.skip("Refactor as a backend test.") @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ @@ -376,8 +376,8 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): def test_auto_transfer_data_from_host_to_device( ray_start_1_cpu_1_gpu, device_choice, auto_transfer ): - import torch import numpy as np + import torch def compute_average_runtime(func): device = torch.device(device_choice) @@ -417,7 +417,8 @@ def host_to_device_auto_pipeline(device): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", "-s", __file__])) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index c6c6d6bba7b3..5f3be1d4c3b3 
100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,18 +1,16 @@ -from typing import List, Dict - import pytest import ray import ray.train as train -from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend -from ray.train.callbacks import TrainingCallback +from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -32,15 +30,7 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass -class TestCallback(TrainingCallback): - def __init__(self): - self.result_list = [] - - def handle_result(self, results: List[Dict], **info): - self.result_list.append(results) - - -def test_run(ray_start_2_cpus): +def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 key = "value" @@ -53,27 +43,23 @@ def train_func(): train.save_checkpoint(**checkpoint) return checkpoint[key] - checkpoint = {key: value} - test_callback = TestCallback() - - trainer = Trainer(config, num_workers=num_workers) - trainer.start() - results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + checkpoint = Checkpoint.from_dict( + { + # this would be set during checkpoint saving + "_current_checkpoint_id": 1, + key: value, + } + ) - # Test results. - assert len(results) == num_workers - assert all(result == 1 for result in results) + trainer = DataParallelTrainer( + train_func, + backend_config=config, + resume_from_checkpoint=checkpoint, + scaling_config=dict(num_workers=num_workers), + ) + results = trainer.fit() - # Test reporting and callbacks. - assert len(test_callback.result_list) == value - assert len(test_callback.result_list[0]) == num_workers - print(test_callback.result_list[0]) - assert all(result[key] == value for result in test_callback.result_list[0]) - - # Test checkpointing. 
- assert trainer.latest_checkpoint[key] == value - - trainer.shutdown() + assert results.checkpoint def test_failure(): @@ -89,7 +75,8 @@ def test_failure(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index f08b3da43dc6..2fed4e42fa43 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -1,25 +1,31 @@ import os import pytest + import ray import ray.train as train from ray import tune from ray.air import Checkpoint -from ray.tune import TuneError +from ray.air.config import FailureConfig, RunConfig from ray.train import Trainer +from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig +from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) from ray.train.examples.train_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train._internal.worker_group import WorkerGroup +from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer +from ray.train.torch.torch_trainer import TorchTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner @pytest.fixture -def ray_start_2_cpus(): - address_info = ray.init(num_cpus=2) +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) yield address_info # The code after the yield will run as teardown code. ray.shutdown() @@ -50,18 +56,24 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): def torch_fashion_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(fashion_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + trainer = TorchTrainer( + fashion_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. for path, df in analysis.trial_dataframes.items(): @@ -74,18 +86,25 @@ def test_tune_torch_fashion_mnist(ray_start_8_cpus): def tune_tensorflow_mnist(num_workers, use_gpu, num_samples): epochs = 2 - trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=use_gpu) - MnistTrainable = trainer.to_tune_trainable(tensorflow_mnist_train_func) - - analysis = tune.run( - MnistTrainable, - num_samples=num_samples, - config={ - "lr": tune.loguniform(1e-4, 1e-1), - "batch_size": tune.choice([32, 64, 128]), - "epochs": epochs, + + trainer = TensorflowTrainer( + tensorflow_mnist_train_func, + scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu), + ) + tuner = Tuner( + trainer, + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": epochs, + } }, + tune_config=TuneConfig( + num_samples=num_samples, + ), ) + analysis = tuner.fit()._experiment_analysis # Check that loss decreases in each trial. 
for path, df in analysis.trial_dataframes.items(): @@ -96,18 +115,7 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) -def test_tune_error(ray_start_2_cpus): - def train_func(config): - raise RuntimeError("Error in training function!") - - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - with pytest.raises(TuneError): - tune.run(TestTrainable) - - -def test_tune_checkpoint(ray_start_2_cpus): +def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) @@ -123,7 +131,7 @@ def train_func(): assert checkpoint["hello"] == "world" -def test_reuse_checkpoint(ray_start_2_cpus): +def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 ckpt = train.load_checkpoint() @@ -134,19 +142,28 @@ def train_func(config): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) - - [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 4 - analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 10}}, + ).restore(trial.local_dir) + analysis = tuner.fit()._experiment_analysis trial_dfs = list(analysis.trial_dataframes.values()) assert len(trial_dfs[0]["training_iteration"]) == 5 -def test_retry(ray_start_2_cpus): +def test_retry(ray_start_4_cpus): def train_func(): ckpt = train.load_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? @@ -160,10 +177,12 @@ def train_func(): train.save_checkpoint(iter=i) train.report(test=i, training_iteration=i) - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) - analysis = tune.run(TestTrainable, max_failures=3) + analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() assert checkpoint["iter"] == 3 @@ -173,7 +192,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", "-x", __file__])) From 50ca40b3a9b701d3a97a256295de9505f5d82605 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 15 Jun 2022 21:20:04 +0000 Subject: [PATCH 15/63] Fixture fix --- python/ray/train/tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 06c88577205e..169c0a29e236 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -62,7 +62,7 @@ def test_tf_non_distributed(ray_start_4_cpus): # TODO: Refactor as a backend test. 
-def test_tensorflow_mnist_fail(ray_start_2_cpus): +def test_tensorflow_mnist_fail(ray_start_4_cpus): """Tests if tensorflow example works even with worker failure.""" epochs = 3 From 20b707571febcff0ec8f2d9ba79e8005d56f85fd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 15:48:50 +0000 Subject: [PATCH 16/63] CI fixes --- .../examples/train_linear_dataset_example.py | 22 ++++++++++++++----- python/ray/train/tests/test_examples.py | 6 +++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index f84faa5f7a11..3038ac66aa9e 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -1,16 +1,19 @@ import argparse -from typing import Dict +from typing import Dict, Tuple import torch import torch.nn as nn import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.torch import TorchTrainer -def get_datasets(a=5, b=10, size=1000, split=0.8) -> Dict[str, Dataset]: +def get_datasets_and_configs( + a=5, b=10, size=1000, split=0.8 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) @@ -27,7 +30,13 @@ def get_dataset(a, b, size) -> Dataset: "validation": validation_dataset, } - return datasets + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + "validation": DatasetConfig(use_stream_api=True), + } + + return datasets, dataset_configs def train_epoch(iterable_dataset, model, loss_fn, optimizer, device): @@ -113,13 +122,14 @@ def train_func(config): def train_linear(num_workers=2, use_gpu=False): - datasets = get_datasets() + datasets, dataset_configs = get_datasets_and_configs() config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3} trainer = TorchTrainer( train_func, train_loop_config=config, datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -152,8 +162,8 @@ def train_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 169c0a29e236..fd6a2fadbf91 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -116,9 +116,11 @@ def test_torch_linear_failure(ray_start_4_cpus): results = trainer.run(linear_train_func, config, callbacks=[kill_callback]) trainer.shutdown() - result = results.metrics + assert len(results) == num_workers - assert result[TRAINING_ITERATION] == epochs + for result in results: + assert len(result) == epochs + assert result[-1]["loss"] < result[0]["loss"] def test_torch_fashion_mnist(ray_start_4_cpus): From c3b7d42c5f15cf3d44fec370df0c5cff9443b96e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 16:51:32 +0000 Subject: [PATCH 17/63] Fix --- .../ray/train/examples/tensorflow_quick_start.py | 7 +++---- python/ray/train/examples/torch_quick_start.py | 7 +++---- 
python/ray/train/tests/test_gpu.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/python/ray/train/examples/tensorflow_quick_start.py b/python/ray/train/examples/tensorflow_quick_start.py index 0ac3666672e2..f0c7f3d10f4e 100644 --- a/python/ray/train/examples/tensorflow_quick_start.py +++ b/python/ray/train/examples/tensorflow_quick_start.py @@ -1,15 +1,12 @@ # flake8: noqa # fmt: off +# isort: skip_file # __tf_setup_begin__ -import json -import os - import numpy as np import tensorflow as tf - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. @@ -50,6 +47,8 @@ def train_func(): # __tf_distributed_begin__ +import json +import os def train_func_distributed(): per_worker_batch_size = 64 diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index eaf07a95a5d1..2f0da37ddbc9 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -1,13 +1,10 @@ # flake8: noqa # fmt: off +# isort: skip_file # __torch_setup_begin__ import torch import torch.nn as nn -import torch.optim as optim - -import ray.train.torch -from ray import train num_samples = 20 input_size = 10 @@ -32,6 +29,7 @@ def forward(self, input): # __torch_single_begin__ +import torch.optim as optim def train_func(): num_epochs = 3 @@ -51,6 +49,7 @@ def train_func(): # __torch_distributed_begin__ +from ray import train def train_func_distributed(): num_epochs = 3 diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 16dac0c42fc7..ac9a0afe7cfb 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -42,7 +42,7 @@ def ray_start_1_cpu_1_gpu(): ray.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1]) def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker): def train_fn(): @@ -69,7 +69,7 @@ def train_fn(): ) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_model(ray_start_4_cpus_2_gpus): """Tests if ``prepare_model`` correctly wraps in DDP.""" @@ -91,7 +91,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus): data_loader = DataLoader(LinearDataset(a=1, b=2, size=10)) @@ -115,7 +115,7 @@ def train_fn(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize("use_gpu", (False, True)) def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu): # NOTE: Reproducible results aren't guaranteed between seeded executions, even with @@ -162,7 +162,7 @@ def train_func(): assert result1 == result2 -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_amp_performance(ray_start_4_cpus_2_gpus): def train_func(config): train.torch.accelerate(amp=config["amp"]) @@ -205,7 +205,7 @@ def latency(amp: bool) -> float: assert 1.05 * latency(amp=True) < latency(amp=False) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. 
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus): """Test that model with AMP is serializable.""" @@ -223,7 +223,7 @@ def train_func(): trainer.shutdown() -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus): """Tests if GPU tensors are auto converted to CPU on driver.""" @@ -363,7 +363,7 @@ def test_tensorflow_linear_dataset_gpu(ray_start_4_cpus_2_gpus): assert train_tensorflow_linear(num_workers=2, use_gpu=True) -@pytest.mark.skip("Refactor as a backend test.") +# TODO: Refactor as a backend test. @pytest.mark.parametrize( ("device_choice", "auto_transfer"), [ From 37b81825e4c6fe406f88cc92446afac021ca4c74 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 19:28:38 +0000 Subject: [PATCH 18/63] Apply suggestions from code review --- python/ray/train/BUILD | 8 + .../train/examples/horovod/horovod_example.py | 2 - .../tensorflow_linear_dataset_example.py | 30 +- .../examples/train_linear_dataset_example.py | 5 - .../train/examples/train_linear_example.py | 5 - python/ray/train/tests/test_callbacks.py | 357 ++++++++++++++++++ python/ray/train/tests/test_minimal.py | 2 +- python/ray/train/tests/test_tune.py | 28 +- 8 files changed, 407 insertions(+), 30 deletions(-) create mode 100644 python/ray/train/tests/test_callbacks.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6124c35c2606..6f719b725e64 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -129,6 +129,14 @@ py_test( deps = [":train_lib"] ) +py_test( + name = "test_callbacks", + size = "medium", + srcs = ["tests/test_callbacks.py"], + tags = ["team:ml", "exclusive"], + deps = [":train_lib"] +) + py_test( name = "test_data_parallel_trainer", size = "medium", diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index 1e163da70052..c01788008ec5 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -144,13 +144,11 @@ def train_func(config): model, optimizer, train_loader, train_sampler = setup(config) - results = [] for epoch in range(num_epochs): loss = train_epoch( model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda ) train.report(loss=loss) - return results def main(num_workers, use_gpu, kwargs): diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9271c5125da4..9dbb3205b7bf 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -1,12 +1,13 @@ import argparse +from typing import Dict, Tuple import tensorflow as tf from tensorflow.keras.callbacks import Callback import ray import ray.train as train +from ray.air.config import DatasetConfig from ray.data import Dataset -from ray.data.dataset_pipeline import DatasetPipeline from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard @@ -15,17 +16,22 @@ def on_epoch_end(self, epoch, logs=None): train.report(**logs) -def get_dataset_pipeline(a=5, b=10, size=1000) -> DatasetPipeline: +def get_datasets_and_configs( + a=5, b=10, size=1000 +) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: def get_dataset(a, b, size) -> Dataset: items = [i / size for i in range(size)] dataset = ray.data.from_items([{"x": x, "y": a * x + b} for x in items]) return dataset - dataset = 
get_dataset(a, b, size) + datasets = {"train": get_dataset(a, b, size)} - dataset_pipeline = dataset.repeat().random_shuffle_each_window() + # Use dataset pipelining + dataset_configs = { + "train": DatasetConfig(use_stream_api=True), + } - return dataset_pipeline + return datasets, dataset_configs def build_and_compile_model(config): @@ -57,7 +63,6 @@ def train_func(config): dataset_pipeline = train.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() - results = [] for _ in range(epochs): dataset = next(dataset_iterator) tf_dataset = prepare_dataset_shard( @@ -70,17 +75,16 @@ def train_func(config): batch_size=batch_size, ) ) - history = multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) - results.append(history.history) - return results + multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) def train_tensorflow_linear(num_workers=2, use_gpu=False): - dataset_pipeline = get_dataset_pipeline() + datasets, dataset_configs = get_datasets_and_configs() trainer = TensorflowTrainer( train_func, train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4}, - datasets={"train": dataset_pipeline}, + datasets=datasets, + dataset_config=dataset_configs, scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() @@ -113,8 +117,8 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): args, _ = parser.parse_known_args() if args.smoke_test: - # 1 for datasets - num_cpus = args.num_workers + 1 + # 1 for datasets, 1 for Trainable actor + num_cpus = args.num_workers + 2 num_gpus = args.num_workers if args.use_gpu else 0 ray.init(num_cpus=num_cpus, num_gpus=num_gpus) else: diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/train_linear_dataset_example.py index 3038ac66aa9e..acfa0ce2e637 100644 --- a/python/ray/train/examples/train_linear_dataset_example.py +++ b/python/ray/train/examples/train_linear_dataset_example.py @@ -87,8 +87,6 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs() validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs() @@ -116,9 +114,6 @@ def train_func(config): train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False): diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 069c6dd13db1..7e09acef3d3d 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,15 +74,10 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) - results = [] - for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) - results.append(result) - - return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py new file mode 100644 index 000000000000..b21adf6634b9 --- /dev/null +++ b/python/ray/train/tests/test_callbacks.py @@ -0,0 +1,357 @@ +from typing import Dict, List +import glob +import io +import json +from collections import defaultdict +from contextlib import 
redirect_stdout +from pathlib import Path + +import pytest + +import ray +import ray.train as train +from ray.train import Trainer +from ray.train.backend import BackendConfig, Backend +from ray.train.callbacks import ( + TrainingCallback, + JsonLoggerCallback, + PrintCallback, + TBXLoggerCallback, + TorchTensorboardProfilerCallback, +) +from ray.train.callbacks.logging import ( + MLflowLoggerCallback, + _TrainCallbackLogdirManager, +) +from ray.train.constants import ( + TRAINING_ITERATION, + DETAILED_AUTOFILLED_KEYS, + BASIC_AUTOFILLED_KEYS, + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) + +try: + from tensorflow.python.summary.summary_iterator import summary_iterator +except ImportError: + summary_iterator = None + + +@pytest.fixture +def ray_start_4_cpus(): + address_info = ray.init(num_cpus=4) + yield address_info + # The code after the yield will run as teardown code. + ray.shutdown() + + +class TestConfig(BackendConfig): + @property + def backend_cls(self): + return TestBackend + + +class TestBackend(Backend): + def on_start(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): + pass + + +def test_print(ray_start_4_cpus): + num_workers = 4 + + def train_func(): + train.report(rank=train.world_rank()) + + stream = io.StringIO() + with redirect_stdout(stream): + trainer = Trainer(TestConfig(), num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[PrintCallback()]) + trainer.shutdown() + + output = stream.getvalue() + results = json.loads(output) + + assert len(results) == num_workers + for i, result in enumerate(results): + assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"}) + assert result["rank"] == i + + +@pytest.mark.parametrize("input", [None, "dir", "file"]) +def test_train_callback_logdir_manager(tmp_path, input): + default_dir = tmp_path / "default_dir" + + if input == "dir": + input_logdir = tmp_path / "dir" + input_logdir.mkdir(parents=True) + elif input == "file": + input_logdir = tmp_path / "file" + input_logdir.touch() + else: + input_logdir = None + + logdir_manager = _TrainCallbackLogdirManager(input_logdir) + + if input_logdir: + path = logdir_manager.logdir_path + assert path == logdir_manager.logdir_path + else: + with pytest.raises(RuntimeError): + path = logdir_manager.logdir_path + + if input_logdir and not Path(input_logdir).is_dir(): + with pytest.raises(FileExistsError): + logdir_manager.setup_logdir(str(default_dir)) + else: + path = logdir_manager.setup_logdir(str(default_dir)) + assert path == logdir_manager.logdir_path + + +@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]]) +@pytest.mark.parametrize("detailed", [False, True]) +@pytest.mark.parametrize("filename", [None, "my_own_filename.json"]) +def test_json( + monkeypatch, ray_start_4_cpus, tmp_path, workers_to_log, detailed, filename +): + if detailed: + monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1") + + config = TestConfig() + + num_iters = 5 + num_workers = 4 + + if workers_to_log is None: + num_workers_to_log = num_workers + elif isinstance(workers_to_log, int): + num_workers_to_log = 1 + else: + num_workers_to_log = len(workers_to_log) + + def train_func(): + for i in range(num_iters): + train.report(index=i) + return 1 + + if filename is None: + # if None, use default value + 
callback = JsonLoggerCallback(workers_to_log=workers_to_log) + else: + callback = JsonLoggerCallback(filename=filename, workers_to_log=workers_to_log) + trainer = Trainer(config, num_workers=num_workers, logdir=str(tmp_path)) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + if filename is None: + assert str(callback.log_path.name) == JsonLoggerCallback._default_filename + else: + assert str(callback.log_path.name) == filename + + with open(callback.log_path, "r") as f: + log = json.load(f) + print(log) + assert len(log) == num_iters + assert len(log[0]) == num_workers_to_log + assert all(len(element) == len(log[0]) for element in log) + assert all( + all(worker["index"] == worker[TRAINING_ITERATION] - 1 for worker in element) + for element in log + ) + assert all( + all(all(key in worker for key in BASIC_AUTOFILLED_KEYS) for worker in element) + for element in log + ) + if detailed: + assert all( + all( + all(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + else: + assert all( + all( + not any(key in worker for key in DETAILED_AUTOFILLED_KEYS) + for worker in element + ) + for element in log + ) + + +def _validate_tbx_result(events_dir): + events_file = list(glob.glob(f"{events_dir}/events*"))[0] + results = defaultdict(list) + for event in summary_iterator(events_file): + for v in event.summary.value: + assert v.tag.startswith("ray/train") + results[v.tag[10:]].append(v.simple_value) + + assert len(results["episode_reward_mean"]) == 3 + assert [int(res) for res in results["episode_reward_mean"]] == [4, 5, 6] + assert len(results["score"]) == 1 + assert len(results["hello/world"]) == 1 + + +def test_TBX(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + + def train_func(): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1}) + return 1 + + callback = TBXLoggerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + _validate_tbx_result(temp_dir) + + +def test_mlflow(ray_start_4_cpus, tmp_path): + config = TestConfig() + + params = {"p1": "p1"} + + temp_dir = tmp_path + num_workers = 4 + + def train_func(config): + train.report(episode_reward_mean=4) + train.report(episode_reward_mean=5) + train.report(episode_reward_mean=6) + return 1 + + callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, config=params, callbacks=[callback]) + + from mlflow.tracking import MlflowClient + + client = MlflowClient(tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri()) + + experiment_id = client.get_experiment_by_name("test_exp").experiment_id + all_runs = callback.mlflow_util._mlflow.search_runs(experiment_ids=[experiment_id]) + assert len(all_runs) == 1 + # all_runs is a pandas dataframe. 
+ all_runs = all_runs.to_dict(orient="records") + run_id = all_runs[0]["run_id"] + run = client.get_run(run_id) + + assert run.data.params == params + assert ( + "episode_reward_mean" in run.data.metrics + and run.data.metrics["episode_reward_mean"] == 6.0 + ) + assert ( + TRAINING_ITERATION in run.data.metrics + and run.data.metrics[TRAINING_ITERATION] == 3.0 + ) + + metric_history = client.get_metric_history(run_id=run_id, key="episode_reward_mean") + + assert len(metric_history) == 3 + iterations = [metric.step for metric in metric_history] + assert iterations == [1, 2, 3] + rewards = [metric.value for metric in metric_history] + assert rewards == [4, 5, 6] + + +def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): + config = TestConfig() + + temp_dir = tmp_path + num_workers = 4 + num_epochs = 2 + + def train_func(): + from ray.train.torch import TorchWorkerProfiler + from torch.profiler import profile, record_function, schedule + + twp = TorchWorkerProfiler() + with profile( + activities=[], + schedule=schedule(wait=0, warmup=0, active=1), + on_trace_ready=twp.trace_handler, + ) as p: + + for epoch in range(num_epochs): + with record_function("test_function"): + pass + + p.step() + + profile_results = twp.get_and_clear_profile_traces() + train.report(epoch=epoch, **profile_results) + + callback = TorchTensorboardProfilerCallback(temp_dir) + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + trainer.run(train_func, callbacks=[callback]) + + assert temp_dir.exists() + + count = 0 + for path in temp_dir.iterdir(): + assert path.is_file() + count += 1 + assert count == num_workers * num_epochs + + +# fix issue: repeat assignments for preprocessor results nested recursive calling +# see https://github.com/ray-project/ray/issues/25005 +def test_hotfix_callback_nested_recusive_calling(): + # test callback used to simulate the nested recursive calling for preprocess() + class TestCallback(TrainingCallback): + def __init__(self): + self.max_process_time = 0 + + def count_process_times(self, processor): + count = 0 + if processor: + if isinstance(processor, SequentialResultsPreprocessor): + for preprocessor in processor.preprocessors: + # recursive calling preprocessors in list + count += self.count_process_times(preprocessor) + else: + count = 1 + return count + + def handle_result(self, results: List[Dict], **info): + process_times = self.count_process_times(self.results_preprocessor) + if process_times > self.max_process_time: + self.max_process_time = process_times + print(f"process times: {process_times}") + + def train_func(): + for idx in range(num_iterates): + train.report(iterate=idx + 1) + + # python default limitation for iterate depth + num_iterates = 1000 + trainer = Trainer(TestConfig(), num_workers=1) + trainer.start() + test_callback = TestCallback() + trainer.run(train_func, callbacks=[test_callback]) + assert test_callback.max_process_time == 1 + print(f"callback max process time: {test_callback.max_process_time}") + trainer.shutdown() + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 5f3be1d4c3b3..a23d7b4f23f9 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -59,7 +59,7 @@ def train_func(): ) results = trainer.fit() - assert results.checkpoint + assert results.checkpoint.to_dict()[key] == 
checkpoint.to_dict()[key] def test_failure(): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 2fed4e42fa43..3fac9a1e6599 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -7,7 +7,6 @@ from ray import tune from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig -from ray.train import Trainer from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig from ray.train.data_parallel_trainer import DataParallelTrainer @@ -115,16 +114,37 @@ def test_tune_tensorflow_mnist(ray_start_8_cpus): tune_tensorflow_mnist(num_workers=2, use_gpu=False, num_samples=2) +def test_tune_error(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + ) + + # with pytest.raises(TuneError): + tuner.fit() + print("a") + + def test_tune_checkpoint(ray_start_4_cpus): def train_func(): for i in range(10): train.report(test=i) train.save_checkpoint(hello="world") - trainer = Trainer(TestConfig(), num_workers=1) - TestTrainable = trainer.to_tune_trainable(train_func) + trainer = DataParallelTrainer( + train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) + ) + tuner = Tuner( + trainer, + param_space={"train_loop_config": {"max_iter": 5}}, + ) - [trial] = tune.run(TestTrainable).trials + [trial] = tuner.fit()._experiment_analysis.trials checkpoint_path = trial.checkpoint.dir_or_data assert os.path.exists(checkpoint_path) checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() From 6f8d7e092c5ea919c99479781b8483f328ceaec6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:20 +0000 Subject: [PATCH 19/63] Fix tracked checkpoint error --- python/ray/util/ml_utils/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9dced9377750..9a27acd10e36 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -132,7 +132,7 @@ def to_air_checkpoint(self) -> Optional[Checkpoint]: checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_data) checkpoint = Checkpoint.from_directory(checkpoint_dir) elif isinstance(checkpoint_data, bytes): - with tempfile.mkdtemp() as tmpdir: + with tempfile.TemporaryDirectory() as tmpdir: TrainableUtil.create_from_pickle(checkpoint_data, tmpdir) # Double wrap in checkpoint so we hold the data in memory and # can remove the temp directory From 85cb1a71e90ee21d5095a817ff10feb478da7922 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 16 Jun 2022 21:58:39 +0000 Subject: [PATCH 20/63] CI fixes --- .../tensorflow_linear_dataset_example.py | 2 +- .../train/examples/train_linear_example.py | 4 +++ python/ray/train/tests/test_callbacks.py | 26 ++++++++++--------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 9dbb3205b7bf..ccc408455b44 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -88,7 +88,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): 
scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results}") return results diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/train_linear_example.py index 7e09acef3d3d..ceabd0c2853f 100644 --- a/python/ray/train/examples/train_linear_example.py +++ b/python/ray/train/examples/train_linear_example.py @@ -74,10 +74,14 @@ def train_func(config): optimizer = torch.optim.SGD(model.parameters(), lr=lr) + results = [] for _ in range(epochs): train_epoch(train_loader, model, loss_fn, optimizer) result = validate_epoch(validation_loader, model, loss_fn) train.report(**result) + results.append(result) + # return required for backwards compatibility with the old API + return results def train_linear(num_workers=2, use_gpu=False, epochs=3): diff --git a/python/ray/train/tests/test_callbacks.py b/python/ray/train/tests/test_callbacks.py index b21adf6634b9..eb6ed7c3db17 100644 --- a/python/ray/train/tests/test_callbacks.py +++ b/python/ray/train/tests/test_callbacks.py @@ -1,37 +1,37 @@ -from typing import Dict, List import glob import io import json from collections import defaultdict from contextlib import redirect_stdout from pathlib import Path +from typing import Dict, List import pytest import ray import ray.train as train from ray.train import Trainer -from ray.train.backend import BackendConfig, Backend +from ray.train._internal.results_preprocessors.preprocessor import ( + SequentialResultsPreprocessor, +) +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig from ray.train.callbacks import ( - TrainingCallback, JsonLoggerCallback, PrintCallback, TBXLoggerCallback, TorchTensorboardProfilerCallback, + TrainingCallback, ) from ray.train.callbacks.logging import ( MLflowLoggerCallback, _TrainCallbackLogdirManager, ) from ray.train.constants import ( - TRAINING_ITERATION, - DETAILED_AUTOFILLED_KEYS, BASIC_AUTOFILLED_KEYS, + DETAILED_AUTOFILLED_KEYS, ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, -) -from ray.train._internal.worker_group import WorkerGroup -from ray.train._internal.results_preprocessors.preprocessor import ( - SequentialResultsPreprocessor, + TRAINING_ITERATION, ) try: @@ -277,9 +277,10 @@ def test_torch_tensorboard_profiler_callback(ray_start_4_cpus, tmp_path): num_epochs = 2 def train_func(): - from ray.train.torch import TorchWorkerProfiler from torch.profiler import profile, record_function, schedule + from ray.train.torch import TorchWorkerProfiler + twp = TorchWorkerProfiler() with profile( activities=[], @@ -351,7 +352,8 @@ def train_func(): if __name__ == "__main__": - import pytest import sys - sys.exit(pytest.main(["-v", "-x", __file__])) \ No newline at end of file + import pytest + + sys.exit(pytest.main(["-v", "-x", __file__])) From 86a71d6bd6958b639e3eeaa19264bc99398c0aa5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 16:10:20 +0000 Subject: [PATCH 21/63] Add checkpoint configuration to `RunConfig` --- python/ray/air/config.py | 23 ++++++++++++++++++++++- python/ray/tune/impl/tuner_internal.py | 10 ++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index a8f5c2b85c66..d75dc3c0cc5d 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -273,6 +273,25 @@ class FailureConfig: max_failures: int = 0 +@dataclass +@PublicAPI(stability="alpha") +class CheckpointingConfig: + 
"""Configuration related to checkpointing of each run/trial. + + Args: + keep_checkpoints_num: Number of checkpoints to keep. A value of + `None` keeps all checkpoints. Defaults to `None`. If set, need + to provide `checkpoint_score_attr`. + checkpoint_score_attr: Specifies by which attribute to rank the + best checkpoint. Default is increasing order. If attribute starts + with `min-` it will rank attribute in decreasing order, i.e. + `min-validation_loss`. + """ + + keep_checkpoints_num: Optional[int] = None + checkpoint_score_attr: Optional[str] = None + + @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -298,8 +317,9 @@ class RunConfig: Currently only stateless callbacks are supported for resumed runs. (any state of the callback will not be checkpointed by Tune and thus will not take effect in resumed runs). - failure: The failure mode configuration. + failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. + checkpointing: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -312,4 +332,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None + checkpointing: Optional[CheckpointingConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 51d67a5a5c9b..7190f6c6ff38 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -166,6 +166,16 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: max_failures=( self._run_config.failure.max_failures if self._run_config.failure else 0 ), + keep_checkpoints_num=( + self._run_config.checkpointing.keep_checkpoints_num + if self._run_config.checkpointing + else None + ), + checkpoint_score_attr=( + self._run_config.checkpointing.checkpoint_score_attr + if self._run_config.checkpointing + else None + ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, From 41eb7809fd2f7bcb1188080f927d95cc5ed23645 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 18:41:58 +0000 Subject: [PATCH 22/63] Add `best_checkpoint` and `dataframe` to `Result` --- python/ray/air/result.py | 9 ++++ python/ray/tune/impl/tuner_internal.py | 49 ++++++++++++---------- python/ray/tune/result_grid.py | 6 +++ python/ray/tune/tests/test_result_grid.py | 50 +++++++++++++++++++++++ 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 69cfd69926b8..2b52fb844244 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -4,6 +4,8 @@ from ray.air.checkpoint import Checkpoint from ray.util.annotations import PublicAPI +import pandas as pd + @dataclass @PublicAPI(stability="alpha") @@ -21,12 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. + best_checkpoint: The best checkpoint of the Trainable, as + determined by the ``metric`` and ``mode`` arguments set. + If either of those has not been set, this will be None. + May be the same as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. 
+ dataframe: The full result dataframe of the Trainable. """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] + best_checkpoint: Optional[Checkpoint] error: Optional[Exception] + dataframe: Optional[pd.DataFrame] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 7190f6c6ff38..b2ea167369d7 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -149,17 +149,11 @@ def fit(self) -> ResultGrid: return ResultGrid(analysis) - def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: - """Fitting for a fresh Tuner.""" - analysis = run( - trainable, - config={**param_space}, + def _get_tune_run_arguments(self) -> Dict[str, Any]: + """Get tune.run arguments common for both new and resumed runs.""" + return dict( mode=self._tune_config.mode, metric=self._tune_config.metric, - num_samples=self._tune_config.num_samples, - search_alg=self._tune_config.search_alg, - scheduler=self._tune_config.scheduler, - name=self._run_config.name, callbacks=self._run_config.callbacks, sync_config=self._run_config.sync_config, stop=self._run_config.stop, @@ -179,27 +173,38 @@ def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, verbose=self._run_config.verbose, + ) + + def _fit_internal(self, trainable, param_space) -> ExperimentAnalysis: + """Fitting for a fresh Tuner.""" + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + config={**param_space}, + num_samples=self._tune_config.num_samples, + search_alg=self._tune_config.search_alg, + scheduler=self._tune_config.scheduler, + name=self._run_config.name, + ), **self._tuner_kwargs, + } + analysis = run( + **args, ) return analysis def _fit_resume(self, trainable) -> ExperimentAnalysis: """Fitting for a restored Tuner.""" - analysis = run( - trainable, - resume=True, - mode=self._tune_config.mode, - metric=self._tune_config.metric, - callbacks=self._run_config.callbacks, - sync_config=self._run_config.sync_config, - stop=self._run_config.stop, - max_failures=( - self._run_config.failure.max_failures if self._run_config.failure else 0 + args = { + **self._get_tune_run_arguments(), + **dict( + run_or_experiment=trainable, + resume=True, ), - _experiment_checkpoint_dir=self._experiment_checkpoint_dir, - raise_on_failed_trial=False, **self._tuner_kwargs, - ) + } + analysis = run(**args) return analysis def __getstate__(self): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9d653ecb4991..2568ebe46b09 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -165,10 +165,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + try: + best_checkpoint = self._experiment_analysis.best_checkpoint + except ValueError: + best_checkpoint = None result = Result( checkpoint=checkpoint, + best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 77fae7453edf..dbcab0037d50 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ 
b/python/ray/tune/tests/test_result_grid.py @@ -3,6 +3,7 @@ import pickle import pytest +import pandas as pd import ray from ray import tune @@ -40,6 +41,55 @@ def f(config): assert result.metrics["config"] == result.config +def test_result_grid_metric_mode(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert os.path.normpath( + result.checkpoint.get_internal_representation()[1] + ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + +def test_result_grid_metric_mode_unset(ray_start_2_cpus): + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps({"step": i})) + tune.report(step=i) + + analysis = tune.run(f, config={"a": 1}) + analysis._legacy_checkpoint = False + result_grid = ResultGrid(analysis) + result = result_grid[0] + assert isinstance(result.checkpoint, Checkpoint) + assert result.best_checkpoint is None + assert isinstance(result.metrics, dict) + assert isinstance(result.config, dict) + assert isinstance(result.dataframe, pd.DataFrame) + assert result.config == {"a": 1} + assert result.metrics["config"] == result.config + assert len(result.dataframe) == 2 + + def test_result_grid_no_checkpoint(ray_start_2_cpus): def f(config): pass From eb2eb6717ff59a29072a566377760fb5cc1e2025 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 19:53:31 +0000 Subject: [PATCH 23/63] Tests, fixes --- python/ray/air/__init__.py | 10 ++- python/ray/air/config.py | 43 +++++++++++-- python/ray/air/result.py | 6 +- python/ray/air/tests/test_api.py | 28 +++++++++ python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/tune/result_grid.py | 28 ++++++++- python/ray/tune/tests/test_result_grid.py | 4 +- python/ray/tune/tests/test_tuner.py | 76 ++++++++++++++++++++++- 8 files changed, 181 insertions(+), 16 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 196fa1aa7e35..2c82cce8f4e3 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -1,5 +1,11 @@ from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import ( + DatasetConfig, + RunConfig, + ScalingConfig, + FailureConfig, + CheckpointingConfig, +) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -11,5 +17,7 @@ "Result", "ScalingConfig", "DatasetConfig", + "FailureConfig", + "CheckpointingConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index d75dc3c0cc5d..ab9cfe79b67e 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -282,14 +282,47 @@ class CheckpointingConfig: 
keep_checkpoints_num: Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. - checkpoint_score_attr: Specifies by which attribute to rank the - best checkpoint. Default is increasing order. If attribute starts - with `min-` it will rank attribute in decreasing order, i.e. - `min-validation_loss`. + checkpoint_score_metric: Specifies by which metric to rank the + best checkpoint. Defaults to training iteration. + checkpoint_score_mode: Must be one of [min, max]. Determines + whether ``checkpoint_score_metric`` should be minimized or maximized. + If not set, will be the same as 'max'. Cannot be set if + ``checkpoint_score_metric`` is not set. """ keep_checkpoints_num: Optional[int] = None - checkpoint_score_attr: Optional[str] = None + checkpoint_score_metric: Optional[str] = None + checkpoint_score_mode: Optional[str] = None + + def __post_init__(self): + if self.checkpoint_score_mode not in (None, "min", "max"): + raise ValueError( + "The `checkpoint_score_mode` parameter can only be " + f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." + ) + if ( + self.checkpoint_score_metric is None + and self.checkpoint_score_mode is not None + ): + raise ValueError( + "`checkpoint_score_mode` cannot be set if " + "`checkpoint_score_metric` is not set." + ) + + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_metric is None: + return self.checkpoint_score_metric + prefix = "" + if self.checkpoint_score_mode == "min": + prefix = "min-" + return f"{prefix}{self.checkpoint_score_metric}" + + @property + def checkpoint_score_mode_not_none(self) -> str: + """``checkpoint_score_mode`` but None -> 'max'""" + return self.checkpoint_score_mode or "max" @dataclass diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 2b52fb844244..954c0b8f9054 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,9 +24,11 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. best_checkpoint: The best checkpoint of the Trainable, as - determined by the ``metric`` and ``mode`` arguments set. + determined by the ``checkpointing`` argument of ``RunConfig``, + or, if that's unset, by ``metric`` and ``mode`` arguments of + ``TuneConfig``. If either of those has not been set, this will be None. - May be the same as ``checkpoint``. + May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. dataframe: The full result dataframe of the Trainable. 
""" diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index dce2ce930c8d..20138448a77d 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,6 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass +from ray.air.config import CheckpointingConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -38,6 +39,33 @@ def test_run_config(): DummyTrainer(run_config=ray.air.RunConfig()) +def test_checkpointing_config(): + # cannot set checkpoint_score_mode if checkpoint_score_metric is unset + with pytest.raises(ValueError): + CheckpointingConfig(checkpoint_score_mode="min") + + with pytest.raises(ValueError): + CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + ) + + checkpointing = CheckpointingConfig() + assert checkpointing.checkpoint_score_attr is None + + checkpointing = CheckpointingConfig(checkpoint_score_metric="metric") + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="max" + ) + assert checkpointing.checkpoint_score_attr == "metric" + + checkpointing = CheckpointingConfig( + checkpoint_score_metric="metric", checkpoint_score_mode="min" + ) + assert checkpointing.checkpoint_score_attr == "min-metric" + + def test_scaling_config(): with pytest.raises(ValueError): DummyTrainer(scaling_config="invalid") diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index b2ea167369d7..7a0bf39eff6a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis) + return ResultGrid(analysis, self._run_config.checkpointing) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 2568ebe46b09..ef4dd58b5064 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import pandas as pd @@ -11,6 +11,9 @@ from ray.tune.trial import Trial from ray.util import PublicAPI +if TYPE_CHECKING: + from ray.air.config import CheckpointingConfig + @PublicAPI(stability="alpha") class ResultGrid: @@ -40,8 +43,14 @@ class ResultGrid: seen by Tune will be provided. 
""" - def __init__(self, experiment_analysis: ExperimentAnalysis): + def __init__( + self, + experiment_analysis: ExperimentAnalysis, + checkpointing_config: Optional["CheckpointingConfig"] = None, + ): self._experiment_analysis = experiment_analysis + # Used to determine best checkpoint + self._checkpointing_config = checkpointing_config def get_best_result( self, @@ -165,8 +174,21 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() + + checkpoint_metric = ( + self._checkpointing_config.checkpoint_score_metric + if self._checkpointing_config + else None + ) + checkpoint_mode = ( + self._checkpointing_config.checkpoint_score_mode_not_none + if self._checkpointing_config and checkpoint_metric + else None + ) try: - best_checkpoint = self._experiment_analysis.best_checkpoint + best_checkpoint = self._experiment_analysis.get_best_checkpoint( + trial, metric=checkpoint_metric, mode=checkpoint_mode + ) except ValueError: best_checkpoint = None diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index dbcab0037d50..6a81cfd01d1d 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -50,7 +50,7 @@ def f(config): f.write(json.dumps({"step": i})) tune.report(step=i) - analysis = tune.run(f, config={"a": 1}, metric="step", mode="max") + analysis = tune.run(f, config={"a": 1}, metric="step", mode="min") analysis._legacy_checkpoint = False result_grid = ResultGrid(analysis) result = result_grid[0] @@ -61,7 +61,7 @@ def f(config): assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) == os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index a7cad997092c..07a088efacf6 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import RunConfig +from ray.air.config import CheckpointingConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -32,6 +32,16 @@ class DummyTrainer(BaseTrainer): "placement_strategy", ] + def training_loop(self) -> None: + for i in range(5): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(str(i)) + tune.report(step=i) + + +class FailingTrainer(DummyTrainer): def training_loop(self) -> None: raise RuntimeError("There is an error in trainer!") @@ -189,7 +199,7 @@ def on_step_end(self, iteration, trials, **kwargs): assert len(results) == 4 def test_tuner_trainer_fail(self): - trainer = DummyTrainer() + trainer = FailingTrainer() param_space = { "scaling_config": { "num_workers": tune.grid_search([1, 2]), @@ -243,6 +253,68 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} + def test_tuner_checkpoint_configuration(self): + # Case 1: nothing set + trainer = DummyTrainer() + tuner = Tuner(trainer) + + results = tuner.fit() + result = results[0] + assert 
result.checkpoint + assert not result.best_checkpoint + + # Case 2: metric and mode set + trainer = DummyTrainer() + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + + # Case 3: CheckpointingConfig set. Takes priority. + trainer = DummyTrainer( + run_config=RunConfig( + checkpointing=CheckpointingConfig( + checkpoint_score_metric="step", checkpoint_score_mode="min" + ) + ) + ) + tuner = Tuner( + trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) + ) + + results = tuner.fit() + result = results[0] + assert result.checkpoint + assert result.best_checkpoint + assert ( + os.path.basename( + os.path.normpath( + result.best_checkpoint.get_internal_representation()[1] + ) + ) + == "checkpoint_000000" + ) + assert ( + result.best_checkpoint.get_internal_representation() + != results[1].best_checkpoint.get_internal_representation() + ) + if __name__ == "__main__": import sys From 024932e8acaf3a1deeb2a3af7ccdb9965589bbd3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:17:52 +0000 Subject: [PATCH 24/63] Result grid tweaks --- python/ray/tune/result_grid.py | 43 +++++++++++++++--- python/ray/tune/tests/test_result_grid.py | 53 +++++++++++++++++++++++ 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ef4dd58b5064..78f05e6dcd20 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -58,6 +58,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, + checkpointing_config: Union[bool, "CheckpointingConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -79,6 +80,13 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( @@ -92,6 +100,10 @@ def get_best_result( "`get_best_result` or specify a mode in the " "`TuneConfig` of your `Tuner`." 
) + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -112,7 +124,19 @@ def get_best_result( ) raise RuntimeError(error_msg) - return self._trial_to_result(best_trial) + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return self._trial_to_result( + best_trial, checkpointing_config=checkpointing_config + ) def get_dataframe( self, @@ -159,7 +183,10 @@ def __len__(self) -> int: def __getitem__(self, i) -> Result: """Returns the i'th result in the grid.""" - return self._trial_to_result(self._experiment_analysis.trials[i]) + return self._trial_to_result( + self._experiment_analysis.trials[i], + checkpointing_config=self._checkpointing_config, + ) @staticmethod def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError]]: @@ -172,17 +199,19 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result(self, trial: Trial) -> Result: + def _trial_to_result( + self, trial: Trial, checkpointing_config: "CheckpointingConfig" + ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - self._checkpointing_config.checkpoint_score_metric - if self._checkpointing_config + checkpointing_config.checkpoint_score_metric + if checkpointing_config else None ) checkpoint_mode = ( - self._checkpointing_config.checkpoint_score_mode_not_none - if self._checkpointing_config and checkpoint_metric + checkpointing_config.checkpoint_score_mode_not_none + if checkpointing_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 6a81cfd01d1d..c6bcb7af2077 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -150,6 +150,59 @@ def f(config): assert best_result.metrics["x"] == 2 +def test_best_result_best_checkpoint(ray_start_2_cpus): + from ray.air.config import CheckpointingConfig + + def f(config): + for i in range(2): + with tune.checkpoint_dir(step=i) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + with open(path, "w") as f: + f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) + tune.report(x=config["x"] * (i + 1), step=i) + + def load_checkpoint(result): + with open( + os.path.join(result.best_checkpoint.to_directory(), "checkpoint") + ) as f: + checkpoint_data = json.load(f) + return checkpoint_data + + analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) + + # No checkpointing config. Use metric and mode + result_grid = ResultGrid(analysis) + best_result = result_grid.get_best_result(metric="x", mode="max") + assert best_result.metrics["x"] == 6 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + # Checkpointing config. 
Use by default + result_grid = ResultGrid( + analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + ) + best_result = result_grid.get_best_result(metric="x", mode="min") + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + best_result = result_grid.get_best_result( + metric="x", mode="min", checkpointing_config=False + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 0 + + best_result = result_grid.get_best_result( + metric="x", + mode="min", + checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + ) + assert best_result.metrics["x"] == 2 + assert best_result.best_checkpoint + assert load_checkpoint(best_result)["step"] == 1 + + def test_best_result_no_report(ray_start_2_cpus): def f(config): pass From abf2cdc9a18d56147d1dfd2aec4b56eab0a1223b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 Jun 2022 20:24:23 +0000 Subject: [PATCH 25/63] Extend --- python/ray/tune/result_grid.py | 62 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 78f05e6dcd20..896cd1b885c8 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -52,6 +52,27 @@ def __init__( # Used to determine best checkpoint self._checkpointing_config = checkpointing_config + def _resolve_checkpointing_config( + self, + checkpointing_config: "CheckpointingConfig", + metric: Optional[str] = None, + mode: Optional[str] = None, + ) -> "CheckpointingConfig": + # Lazy import to avoid circular dependency + from ray.air.config import CheckpointingConfig + + metric = metric or self._experiment_analysis.default_metric + mode = mode or self._experiment_analysis.default_mode + + if not isinstance(checkpointing_config, CheckpointingConfig): + if checkpointing_config and self._checkpointing_config: + checkpointing_config = self._checkpointing_config + else: + checkpointing_config = CheckpointingConfig( + checkpoint_score_metric=metric, checkpoint_score_mode=mode + ) + return checkpointing_config + def get_best_result( self, metric: Optional[str] = None, @@ -101,9 +122,6 @@ def get_best_result( "`TuneConfig` of your `Tuner`." 
) - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - best_trial = self._experiment_analysis.get_best_trial( metric=metric, mode=mode, @@ -124,16 +142,10 @@ def get_best_result( ) raise RuntimeError(error_msg) - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + checkpointing_config = self._resolve_checkpointing_config( + checkpointing_config, metric=metric, mode=mode + ) - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config - else: - checkpointing_config = CheckpointingConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) return self._trial_to_result( best_trial, checkpointing_config=checkpointing_config ) @@ -181,11 +193,33 @@ def get_dataframe( def __len__(self) -> int: return len(self._experiment_analysis.trials) - def __getitem__(self, i) -> Result: + def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" + return self.get( + self._experiment_analysis.trials[i], + ) + + def get( + self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True + ): + """Returns the i'th result in the grid. + + Args: + i: index to return. + checkpointing_config: If True (default), will use the + ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + to determine the best checkpoint of the trial. + If False, or if the ``CheckpointingConfig`` object was not set, will use + ``metric`` and ``mode`` as set here. + Can also be a ``CheckpointingConfig`` object, in which case it will + be used directly. + """ + + checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=self._checkpointing_config, + checkpointing_config=checkpointing_config, ) @staticmethod From 563bc338b976e0a4a98d51982cddada388ceb7a6 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 05:38:46 +0200 Subject: [PATCH 26/63] Update result_grid.py --- python/ray/tune/result_grid.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 896cd1b885c8..9321de0d8cdb 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -195,9 +195,7 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get( - self._experiment_analysis.trials[i], - ) + return self.get(i) def get( self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True From d0261bea6d4f40414491d84fe9017cc7ad335c45 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:23 +0000 Subject: [PATCH 27/63] Fix --- python/ray/tune/result_grid.py | 2 +- python/ray/tune/tests/test_result_grid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9321de0d8cdb..344c4356938f 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -232,7 +232,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: "CheckpointingConfig" + self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] ) -> Result: checkpoint = 
trial.checkpoint.to_air_checkpoint() diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index c6bcb7af2077..bccd553469a3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial) + result = result_grid._trial_to_result(trial, checkpointing_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) From 56df4936e05391ae27255d762b1f32f4f2105138 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 16:58:47 +0000 Subject: [PATCH 28/63] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 344c4356938f..4513935697a0 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -107,7 +107,7 @@ def get_best_result( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. """ if not metric and not self._experiment_analysis.default_metric: raise ValueError( From ef0c75ae685afdc31b70a66f43701b265f59decc Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 17:57:36 +0000 Subject: [PATCH 29/63] Lint --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 4513935697a0..6f5edc3d3f13 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -210,7 +210,7 @@ def get( If False, or if the ``CheckpointingConfig`` object was not set, will use ``metric`` and ``mode`` as set here. Can also be a ``CheckpointingConfig`` object, in which case it will - be used directly. + be used directly. 
""" checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) From 3464c93c5eb8fdd7ea0fb3a5c6cf4a071371246d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 18:43:36 +0000 Subject: [PATCH 30/63] WIP --- python/ray/train/tests/test_tune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 3fac9a1e6599..dafe52241312 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -125,9 +126,8 @@ def train_func(config): trainer, ) - # with pytest.raises(TuneError): - tuner.fit() - print("a") + with pytest.raises(TuneError): + tuner.fit() def test_tune_checkpoint(ray_start_4_cpus): From ee87c12d772860c18f80d4a2bb4a5c2514a81195 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 19:12:46 +0000 Subject: [PATCH 31/63] Renaming --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 6 +- python/ray/air/result.py | 12 ++-- python/ray/air/tests/test_api.py | 14 ++--- python/ray/tune/impl/tuner_internal.py | 10 ++-- python/ray/tune/result_grid.py | 68 +++++++++++------------ python/ray/tune/tests/test_result_grid.py | 10 ++-- python/ray/tune/tests/test_tuner.py | 6 +- 8 files changed, 63 insertions(+), 67 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 2c82cce8f4e3..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,7 +4,7 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointingConfig, + CheckpointConfig, ) from ray.air.data_batch_type import DataBatchType from ray.air.result import Result @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointingConfig", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index ab9cfe79b67e..5b0a886cc3e2 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -275,7 +275,7 @@ class FailureConfig: @dataclass @PublicAPI(stability="alpha") -class CheckpointingConfig: +class CheckpointConfig: """Configuration related to checkpointing of each run/trial. Args: @@ -352,7 +352,7 @@ class RunConfig: and thus will not take effect in resumed runs). failure: Failure mode configuration. sync_config: Configuration object for syncing. See tune.SyncConfig. - checkpointing: Checkpointing configuration. + checkpoint_config: Checkpointing configuration. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief results, 3 = status and detailed results. Defaults to 2. @@ -365,5 +365,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpointing: Optional[CheckpointingConfig] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 954c0b8f9054..f615959a2fdc 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -23,11 +23,13 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. 
         checkpoint: The final checkpoint of the Trainable.
-        best_checkpoint: The best checkpoint of the Trainable, as
-            determined by the ``checkpointing`` argument of ``RunConfig``,
-            or, if that's unset, by ``metric`` and ``mode`` arguments of
-            ``TuneConfig``.
-            If either of those has not been set, this will be None.
+        best_checkpoint: The best checkpoint of the Trainable.
+            This will be determined by (from highest priority):
+
+            1. ``checkpoint_config`` argument of ``run_config``
+            2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``)
+
+            If neither of those has been set, this will be None.
             May be the same object as ``checkpoint``.
         error: The execution error of the Trainable run, if the trial finishes in error.
         dataframe: The full result dataframe of the Trainable.
diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py
index 20138448a77d..136cb0e58473 100644
--- a/python/ray/air/tests/test_api.py
+++ b/python/ray/air/tests/test_api.py
@@ -4,7 +4,7 @@
 from ray.air import Checkpoint
 from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated
 from ray.air.config import ScalingConfigDataClass
-from ray.air.config import CheckpointingConfig
+from ray.air.config import CheckpointConfig
 from ray.data.preprocessor import Preprocessor
 from ray.train.trainer import BaseTrainer
 
@@ -42,25 +42,25 @@ def test_run_config():
 def test_checkpointing_config():
     # cannot set checkpoint_score_mode if checkpoint_score_metric is unset
     with pytest.raises(ValueError):
-        CheckpointingConfig(checkpoint_score_mode="min")
+        CheckpointConfig(checkpoint_score_mode="min")
 
     with pytest.raises(ValueError):
-        CheckpointingConfig(
+        CheckpointConfig(
             checkpoint_score_metric="metric", checkpoint_score_mode="invalid"
         )
 
-    checkpointing = CheckpointingConfig()
+    checkpointing = CheckpointConfig()
     assert checkpointing.checkpoint_score_attr is None
 
-    checkpointing = CheckpointingConfig(checkpoint_score_metric="metric")
+    checkpointing = CheckpointConfig(checkpoint_score_metric="metric")
    assert checkpointing.checkpoint_score_attr == "metric"
 
-    checkpointing = CheckpointingConfig(
+    checkpointing = CheckpointConfig(
         checkpoint_score_metric="metric", checkpoint_score_mode="max"
     )
     assert checkpointing.checkpoint_score_attr == "metric"
 
-    checkpointing = CheckpointingConfig(
+    checkpointing = CheckpointConfig(
         checkpoint_score_metric="metric", checkpoint_score_mode="min"
     )
     assert checkpointing.checkpoint_score_attr == "min-metric"
diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py
index 7a0bf39eff6a..e2dfadf43fdf 100644
--- a/python/ray/tune/impl/tuner_internal.py
+++ b/python/ray/tune/impl/tuner_internal.py
@@ -147,7 +147,7 @@ def fit(self) -> ResultGrid:
         else:
             analysis = self._fit_resume(trainable)
 
-        return ResultGrid(analysis, self._run_config.checkpointing)
+        return ResultGrid(analysis, self._run_config.checkpoint_config)
 
     def _get_tune_run_arguments(self) -> Dict[str, Any]:
         """Get tune.run arguments common for both new and resumed runs."""
@@ -161,13 +161,13 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]:
                 self._run_config.failure.max_failures if self._run_config.failure else 0
             ),
             keep_checkpoints_num=(
-                self._run_config.checkpointing.keep_checkpoints_num
-                if self._run_config.checkpointing
+                self._run_config.checkpoint_config.keep_checkpoints_num
+                if self._run_config.checkpoint_config
                 else None
             ),
             checkpoint_score_attr=(
self._run_config.checkpoint_config.checkpoint_score_attr + if self._run_config.checkpoint_config else None ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 6f5edc3d3f13..afa68b609b59 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -12,7 +12,7 @@ from ray.util import PublicAPI if TYPE_CHECKING: - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig @PublicAPI(stability="alpha") @@ -46,32 +46,32 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpointing_config: Optional["CheckpointingConfig"] = None, + checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis # Used to determine best checkpoint - self._checkpointing_config = checkpointing_config + self._checkpointing_config = checkpoint_config - def _resolve_checkpointing_config( + def _resolve_checkpoint_config( self, - checkpointing_config: "CheckpointingConfig", + checkpoint_config: "CheckpointConfig", metric: Optional[str] = None, mode: Optional[str] = None, - ) -> "CheckpointingConfig": + ) -> "CheckpointConfig": # Lazy import to avoid circular dependency - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig metric = metric or self._experiment_analysis.default_metric mode = mode or self._experiment_analysis.default_mode - if not isinstance(checkpointing_config, CheckpointingConfig): - if checkpointing_config and self._checkpointing_config: - checkpointing_config = self._checkpointing_config + if not isinstance(checkpoint_config, CheckpointConfig): + if checkpoint_config and self._checkpointing_config: + checkpoint_config = self._checkpointing_config else: - checkpointing_config = CheckpointingConfig( + checkpoint_config = CheckpointConfig( checkpoint_score_metric=metric, checkpoint_score_mode=mode ) - return checkpointing_config + return checkpoint_config def get_best_result( self, @@ -79,7 +79,7 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpointing_config: Union[bool, "CheckpointingConfig"] = True, + checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. @@ -101,12 +101,12 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``run_config`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: @@ -142,13 +142,11 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpointing_config = self._resolve_checkpointing_config( - checkpointing_config, metric=metric, mode=mode + checkpoint_config = self._resolve_checkpoint_config( + checkpoint_config, metric=metric, mode=mode ) - return self._trial_to_result( - best_trial, checkpointing_config=checkpointing_config - ) + return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) def get_dataframe( self, @@ -197,27 +195,25 @@ def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" return self.get(i) - def get( - self, i: int, *, checkpointing_config: Union[bool, "CheckpointingConfig"] = True - ): + def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): """Returns the i'th result in the grid. Args: i: index to return. - checkpointing_config: If True (default), will use the - ``CheckpointingConfig`` object set in Trainer's ``RunConfig`` + checkpoint_config: If True (default), will use the + ``CheckpointConfig`` object set in Trainer's ``RunConfig`` to determine the best checkpoint of the trial. - If False, or if the ``CheckpointingConfig`` object was not set, will use + If False, or if the ``CheckpointConfig`` object was not set, will use ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointingConfig`` object, in which case it will + Can also be a ``CheckpointConfig`` object, in which case it will be used directly. """ - checkpointing_config = self._resolve_checkpointing_config(checkpointing_config) + checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) return self._trial_to_result( self._experiment_analysis.trials[i], - checkpointing_config=checkpointing_config, + checkpoint_config=checkpoint_config, ) @staticmethod @@ -232,18 +228,16 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result( - self, trial: Trial, checkpointing_config: Optional["CheckpointingConfig"] + self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] ) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() checkpoint_metric = ( - checkpointing_config.checkpoint_score_metric - if checkpointing_config - else None + checkpoint_config.checkpoint_score_metric if checkpoint_config else None ) checkpoint_mode = ( - checkpointing_config.checkpoint_score_mode_not_none - if checkpointing_config and checkpoint_metric + checkpoint_config.checkpoint_score_mode_not_none + if checkpoint_config and checkpoint_metric else None ) try: diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index bccd553469a3..96789116db33 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -124,7 +124,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpointing_config=None) + result = result_grid._trial_to_result(trial, checkpoint_config=None) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) @@ -151,7 +151,7 @@ def f(config): def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointingConfig + from ray.air.config import CheckpointConfig def f(config): for i in range(2): @@ -179,7 +179,7 @@ def 
load_checkpoint(result): # Checkpointing config. Use by default result_grid = ResultGrid( - analysis, checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x") + analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") ) best_result = result_grid.get_best_result(metric="x", mode="min") assert best_result.metrics["x"] == 2 @@ -187,7 +187,7 @@ def load_checkpoint(result): assert load_checkpoint(best_result)["step"] == 1 best_result = result_grid.get_best_result( - metric="x", mode="min", checkpointing_config=False + metric="x", mode="min", checkpoint_config=False ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint @@ -196,7 +196,7 @@ def load_checkpoint(result): best_result = result_grid.get_best_result( metric="x", mode="min", - checkpointing_config=CheckpointingConfig(checkpoint_score_metric="x"), + checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), ) assert best_result.metrics["x"] == 2 assert best_result.best_checkpoint diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index 07a088efacf6..dab5b9a1b51b 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointingConfig, RunConfig +from ray.air.config import CheckpointConfig, RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -286,10 +286,10 @@ def test_tuner_checkpoint_configuration(self): != results[1].best_checkpoint.get_internal_representation() ) - # Case 3: CheckpointingConfig set. Takes priority. + # Case 3: CheckpointConfig set. Takes priority. trainer = DummyTrainer( run_config=RunConfig( - checkpointing=CheckpointingConfig( + checkpoint_config=CheckpointConfig( checkpoint_score_metric="step", checkpoint_score_mode="min" ) ) From b10fe1e18745cd045168f256229b64c3b841fa6b Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:05:25 +0000 Subject: [PATCH 32/63] Improve test coverage --- python/ray/train/tests/test_examples.py | 12 ++++++++++++ python/ray/train/tests/test_tune.py | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index fd6a2fadbf91..2ebef818d7aa 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,6 +51,10 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_tf_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without TF MultiWorkerMirroredStrategy.""" @@ -103,6 +107,10 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. 
def test_torch_linear_failure(ray_start_4_cpus): @@ -138,6 +146,10 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs + loss = list(results.dataframe["loss"]) + assert len(loss) == epochs + assert loss[-1] < loss[0] + def test_torch_non_distributed(ray_start_4_cpus): """Make sure Ray Train works without torch DDP.""" diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index dafe52241312..0196a84e46b6 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,6 @@ import ray import ray.train as train from ray import tune -from ray.tune import TuneError from ray.air import Checkpoint from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -126,8 +125,9 @@ def train_func(config): trainer, ) - with pytest.raises(TuneError): - tuner.fit() + result_grid = tuner.fit() + with pytest.raises(RuntimeError): + raise result_grid[0].error def test_tune_checkpoint(ray_start_4_cpus): From 4dbcccaba67b23538d969e5df309c506e128d964 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:51:57 +0000 Subject: [PATCH 33/63] Simplify --- python/ray/air/result.py | 18 ++-- .../ray/tune/analysis/experiment_analysis.py | 2 +- python/ray/tune/function_runner.py | 2 + python/ray/tune/result_grid.py | 84 +++---------------- python/ray/tune/tests/test_result_grid.py | 65 +++++--------- python/ray/tune/tests/test_tuner.py | 64 +------------- python/ray/tune/trial.py | 3 + 7 files changed, 46 insertions(+), 192 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index f615959a2fdc..77b3d4b03d28 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Tuple from dataclasses import dataclass from ray.air.checkpoint import Checkpoint @@ -23,23 +23,19 @@ class Result: Args: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. - best_checkpoint: The best checkpoint of the Trainable. - This will be determined by (from highest priority): - - 1. ``checkpoint_config`` argument of ``run_config`` - 2. ``metric`` and ``mode`` arguments of ``tune_config`` (if using ``Tuner``) - - If neither of those has not been set, this will be None. - May be the same object as ``checkpoint``. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + dataframe: The full result dataframe of the Trainable. Each row of the + dataframe corresponds to one iteration and contains reported + metrics. + checkpoint_history: A list of tuples of all checkpoints saved + by the Trainable and their associated metrics. 
""" metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] - best_checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] + checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 84d99637f4c7..f537788f61f9 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -436,7 +436,7 @@ def get_trial_checkpoints_paths( ) return path_metric_df[["chkpt_path", metric]].values.tolist() elif isinstance(trial, Trial): - checkpoints = trial.checkpoint_manager.best_checkpoints() + checkpoints = trial.get_trial_checkpoints() # Support metrics given as paths, e.g. # "info/learner/default_policy/policy_loss". return [ diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index 89930e921351..02f5cf707989 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -441,6 +441,8 @@ def step(self): new_result = self._last_result.copy() new_result.update(result) result = new_result + # Do not checkpoint again + result[SHOULD_CHECKPOINT] = False self._last_result = result if self._status_reporter.has_new_checkpoint(): diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index afa68b609b59..12f4cf0c8514 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import pandas as pd @@ -11,9 +11,6 @@ from ray.tune.trial import Trial from ray.util import PublicAPI -if TYPE_CHECKING: - from ray.air.config import CheckpointConfig - @PublicAPI(stability="alpha") class ResultGrid: @@ -46,32 +43,8 @@ class ResultGrid: def __init__( self, experiment_analysis: ExperimentAnalysis, - checkpoint_config: Optional["CheckpointConfig"] = None, ): self._experiment_analysis = experiment_analysis - # Used to determine best checkpoint - self._checkpointing_config = checkpoint_config - - def _resolve_checkpoint_config( - self, - checkpoint_config: "CheckpointConfig", - metric: Optional[str] = None, - mode: Optional[str] = None, - ) -> "CheckpointConfig": - # Lazy import to avoid circular dependency - from ray.air.config import CheckpointConfig - - metric = metric or self._experiment_analysis.default_metric - mode = mode or self._experiment_analysis.default_mode - - if not isinstance(checkpoint_config, CheckpointConfig): - if checkpoint_config and self._checkpointing_config: - checkpoint_config = self._checkpointing_config - else: - checkpoint_config = CheckpointConfig( - checkpoint_score_metric=metric, checkpoint_score_mode=mode - ) - return checkpoint_config def get_best_result( self, @@ -79,7 +52,6 @@ def get_best_result( mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True, - checkpoint_config: Union[bool, "CheckpointConfig"] = True, ) -> Result: """Get the best result from all the trials run. 
@@ -142,11 +114,7 @@ def get_best_result( ) raise RuntimeError(error_msg) - checkpoint_config = self._resolve_checkpoint_config( - checkpoint_config, metric=metric, mode=mode - ) - - return self._trial_to_result(best_trial, checkpoint_config=checkpoint_config) + return self._trial_to_result(best_trial) def get_dataframe( self, @@ -193,27 +161,8 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Result: """Returns the i'th result in the grid.""" - return self.get(i) - - def get(self, i: int, *, checkpoint_config: Union[bool, "CheckpointConfig"] = True): - """Returns the i'th result in the grid. - - Args: - i: index to return. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``RunConfig`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. - """ - - checkpoint_config = self._resolve_checkpoint_config(checkpoint_config) - return self._trial_to_result( self._experiment_analysis.trials[i], - checkpoint_config=checkpoint_config, ) @staticmethod @@ -227,31 +176,20 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return TuneError(f.read()) return None - def _trial_to_result( - self, trial: Trial, checkpoint_config: Optional["CheckpointConfig"] - ) -> Result: + def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - - checkpoint_metric = ( - checkpoint_config.checkpoint_score_metric if checkpoint_config else None - ) - checkpoint_mode = ( - checkpoint_config.checkpoint_score_mode_not_none - if checkpoint_config and checkpoint_metric - else None - ) - try: - best_checkpoint = self._experiment_analysis.get_best_checkpoint( - trial, metric=checkpoint_metric, mode=checkpoint_mode - ) - except ValueError: - best_checkpoint = None + checkpoint_history = [ + (checkpoint.to_air_checkpoint(), checkpoint.metrics) + for checkpoint in trial.get_trial_checkpoints() + ] result = Result( checkpoint=checkpoint, - best_checkpoint=best_checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir), + dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + if self._experiment_analysis + else None, + checkpoint_history=checkpoint_history, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 96789116db33..0de68be19190 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,13 +55,17 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.best_checkpoint, Checkpoint) + assert isinstance(result.checkpoint_history, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] - ) != os.path.normpath(result.best_checkpoint.get_internal_representation()[1]) + ) != os.path.normpath( + min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + 0 + ].get_internal_representation()[1] + ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config assert len(result.dataframe) == 2 @@ 
-81,7 +85,6 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert result.best_checkpoint is None assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) @@ -124,10 +127,11 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): result_grid = ResultGrid(None) # Internal result grid conversion - result = result_grid._trial_to_result(trial, checkpoint_config=None) + result = result_grid._trial_to_result(trial) assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) + assert result.dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config @@ -150,57 +154,30 @@ def f(config): assert best_result.metrics["x"] == 2 -def test_best_result_best_checkpoint(ray_start_2_cpus): - from ray.air.config import CheckpointConfig - +def test_best_result_checkpoint_history(ray_start_2_cpus): def f(config): for i in range(2): with tune.checkpoint_dir(step=i) as checkpoint_dir: path = os.path.join(checkpoint_dir, "checkpoint") with open(path, "w") as f: - f.write(json.dumps(dict(x=config["x"] * (i + 1), step=i))) - tune.report(x=config["x"] * (i + 1), step=i) - - def load_checkpoint(result): - with open( - os.path.join(result.best_checkpoint.to_directory(), "checkpoint") - ) as f: - checkpoint_data = json.load(f) - return checkpoint_data + f.write(json.dumps(dict(x=config["x"], step=i))) + tune.report(x=config["x"], step=i) analysis = tune.run(f, config={"x": tune.grid_search([1, 3])}) # No checkpointing config. Use metric and mode result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") - assert best_result.metrics["x"] == 6 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - # Checkpointing config. 
Use by default - result_grid = ResultGrid( - analysis, checkpoint_config=CheckpointConfig(checkpoint_score_metric="x") - ) - best_result = result_grid.get_best_result(metric="x", mode="min") - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 - - best_result = result_grid.get_best_result( - metric="x", mode="min", checkpoint_config=False - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 0 - - best_result = result_grid.get_best_result( - metric="x", - mode="min", - checkpoint_config=CheckpointConfig(checkpoint_score_metric="x"), - ) - assert best_result.metrics["x"] == 2 - assert best_result.best_checkpoint - assert load_checkpoint(best_result)["step"] == 1 + assert best_result.metrics["x"] == 3 + print(best_result.checkpoint_history) + print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) + assert len(best_result.checkpoint_history) == 2 + i = 0 + for checkpoint, metrics in best_result.checkpoint_history: + assert isinstance(checkpoint, Checkpoint) + assert metrics["x"] == 3 + assert metrics["step"] == i + i += 1 def test_best_result_no_report(ray_start_2_cpus): diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index dab5b9a1b51b..e00e41fcdefd 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -7,7 +7,7 @@ from sklearn.utils import shuffle from ray import tune -from ray.air.config import CheckpointConfig, RunConfig +from ray.air.config import RunConfig from ray.air.examples.pytorch.torch_linear_example import ( train_func as linear_train_func, ) @@ -253,68 +253,6 @@ def test_tuner_run_config_override(self): assert tuner._local_tuner._run_config.stop == {"metric": 4} - def test_tuner_checkpoint_configuration(self): - # Case 1: nothing set - trainer = DummyTrainer() - tuner = Tuner(trainer) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert not result.best_checkpoint - - # Case 2: metric and mode set - trainer = DummyTrainer() - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="min", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - - # Case 3: CheckpointConfig set. Takes priority. 
- trainer = DummyTrainer( - run_config=RunConfig( - checkpoint_config=CheckpointConfig( - checkpoint_score_metric="step", checkpoint_score_mode="min" - ) - ) - ) - tuner = Tuner( - trainer, tune_config=TuneConfig(mode="max", metric="step", num_samples=2) - ) - - results = tuner.fit() - result = results[0] - assert result.checkpoint - assert result.best_checkpoint - assert ( - os.path.basename( - os.path.normpath( - result.best_checkpoint.get_internal_representation()[1] - ) - ) - == "checkpoint_000000" - ) - assert ( - result.best_checkpoint.get_internal_representation() - != results[1].best_checkpoint.get_internal_representation() - ) - if __name__ == "__main__": import sys diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 73e3944ee289..ea5d4bceb809 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -767,6 +767,9 @@ def get_trainable_cls(self): def is_finished(self): return self.status in [Trial.ERROR, Trial.TERMINATED] + def get_trial_checkpoints(self) -> List[_TrackedCheckpoint]: + return self.checkpoint_manager.best_checkpoints() + @property def is_restoring(self): return self.restoring_from is not None From 27e531c3431c23225e3caa47f92e3d93b871cc0d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:54:43 +0000 Subject: [PATCH 34/63] Docstring tweak --- python/ray/tune/analysis/experiment_analysis.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index f537788f61f9..7b0518cfa5c5 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -366,7 +366,11 @@ def results_df(self) -> DataFrame: @property def trial_dataframes(self) -> Dict[str, DataFrame]: - """List of all dataframes of the trials.""" + """List of all dataframes of the trials. + + Each row of the dataframe corresponds to one iteration of a trial + and contains reported metrics. + """ return self._trial_dataframes def dataframe( From 7d1abfe2a2d6b1786e3b571a5cdc8fbcca256cdf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:56:16 +0000 Subject: [PATCH 35/63] Remove docstring --- python/ray/tune/result_grid.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 12f4cf0c8514..9c216b657e7b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -73,13 +73,6 @@ def get_best_result( filter_nan_and_inf: If True (default), NaN or infinite values are disregarded and these trials are never selected as the best trial. - checkpoint_config: If True (default), will use the - ``CheckpointConfig`` object set in Trainer's ``run_config`` - to determine the best checkpoint of the trial. - If False, or if the ``CheckpointConfig`` object was not set, will use - ``metric`` and ``mode`` as set here. - Can also be a ``CheckpointConfig`` object, in which case it will - be used directly. 
""" if not metric and not self._experiment_analysis.default_metric: raise ValueError( From b0dd3baf038252f2f4208b18a10e380d8b566a40 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 20:58:45 +0000 Subject: [PATCH 36/63] Fix --- python/ray/air/result.py | 9 ++++++--- python/ray/tune/result_grid.py | 4 ++-- python/ray/tune/tests/test_result_grid.py | 12 ++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 77b3d4b03d28..a404569b3c9b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -27,15 +27,18 @@ class Result: dataframe: The full result dataframe of the Trainable. Each row of the dataframe corresponds to one iteration and contains reported metrics. - checkpoint_history: A list of tuples of all checkpoints saved - by the Trainable and their associated metrics. + best_checkpoints: A list of tuples of the best checkpoints saved + by the Trainable and their associated metrics. The number of + saved checkpoints is determined by the ``checkpoint_config`` + argument of ``run_config`` (by default, all checkpoints will + be saved). """ metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] dataframe: Optional[pd.DataFrame] - checkpoint_history: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] + best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property def config(self) -> Optional[Dict[str, Any]]: diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 9c216b657e7b..aaf6a73ecf2b 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -171,7 +171,7 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] def _trial_to_result(self, trial: Trial) -> Result: checkpoint = trial.checkpoint.to_air_checkpoint() - checkpoint_history = [ + best_checkpoints = [ (checkpoint.to_air_checkpoint(), checkpoint.metrics) for checkpoint in trial.get_trial_checkpoints() ] @@ -183,6 +183,6 @@ def _trial_to_result(self, trial: Trial) -> Result: dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) if self._experiment_analysis else None, - checkpoint_history=checkpoint_history, + best_checkpoints=best_checkpoints, ) return result diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 0de68be19190..dc49c404cdc6 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -55,14 +55,14 @@ def f(config): result_grid = ResultGrid(analysis) result = result_grid[0] assert isinstance(result.checkpoint, Checkpoint) - assert isinstance(result.checkpoint_history, list) + assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) assert isinstance(result.dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( - min((x for x in result.checkpoint_history), key=lambda x: x[1]["step"])[ + min((x for x in result.best_checkpoints), key=lambda x: x[1]["step"])[ 0 ].get_internal_representation()[1] ) @@ -169,11 +169,11 @@ def f(config): result_grid = ResultGrid(analysis) best_result = result_grid.get_best_result(metric="x", mode="max") assert best_result.metrics["x"] == 3 - print(best_result.checkpoint_history) - print([x[0].get_internal_representation() for x in best_result.checkpoint_history]) - assert len(best_result.checkpoint_history) 
== 2 + print(best_result.best_checkpoints) + print([x[0].get_internal_representation() for x in best_result.best_checkpoints]) + assert len(best_result.best_checkpoints) == 2 i = 0 - for checkpoint, metrics in best_result.checkpoint_history: + for checkpoint, metrics in best_result.best_checkpoints: assert isinstance(checkpoint, Checkpoint) assert metrics["x"] == 3 assert metrics["step"] == i From 5b226abab1b98e5bd237a87270bf2cd31f05b8bf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 21:10:01 +0000 Subject: [PATCH 37/63] Tweak docstring --- python/ray/air/result.py | 4 ++-- python/ray/tune/analysis/experiment_analysis.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index a404569b3c9b..d6fa35a4a809 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,8 +24,8 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. Each row of the - dataframe corresponds to one iteration and contains reported + dataframe: The full result dataframe of the Trainable. + The dataframe is indexed by iterations and contains reported metrics. best_checkpoints: A list of tuples of the best checkpoints saved by the Trainable and their associated metrics. The number of diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index 7b0518cfa5c5..97dd0a924a51 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -368,8 +368,8 @@ def results_df(self) -> DataFrame: def trial_dataframes(self) -> Dict[str, DataFrame]: """List of all dataframes of the trials. - Each row of the dataframe corresponds to one iteration of a trial - and contains reported metrics. + Each dataframe is indexed by iterations and contains reported + metrics. 
""" return self._trial_dataframes From 65ce1d3c9a1fd668a3c09eece4c07d46476c15b9 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 21 Jun 2022 22:21:39 +0000 Subject: [PATCH 38/63] Fix --- python/ray/tune/impl/tuner_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index e2dfadf43fdf..3faf25ee895c 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -147,7 +147,7 @@ def fit(self) -> ResultGrid: else: analysis = self._fit_resume(trainable) - return ResultGrid(analysis, self._run_config.checkpoint_config) + return ResultGrid(analysis) def _get_tune_run_arguments(self) -> Dict[str, Any]: """Get tune.run arguments common for both new and resumed runs.""" From 1e1fbea7d3392aa70869750a09c3e1467678dc06 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 11:22:08 +0000 Subject: [PATCH 39/63] Use CheckpointStrategy --- python/ray/air/__init__.py | 4 +- python/ray/air/config.py | 55 +------------------ python/ray/air/tests/test_api.py | 14 ++--- .../ray/util/ml_utils/checkpoint_manager.py | 10 ++++ 4 files changed, 21 insertions(+), 62 deletions(-) diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 506f9d022cc0..922f1bc83b94 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, - CheckpointConfig, ) +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointConfig", + "CheckpointStrategy", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b0a886cc3e2..5ea53d92f749 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,6 +5,7 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI +from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TYPE_CHECKING: from ray.data import Dataset @@ -273,58 +274,6 @@ class FailureConfig: max_failures: int = 0 -@dataclass -@PublicAPI(stability="alpha") -class CheckpointConfig: - """Configuration related to checkpointing of each run/trial. - - Args: - keep_checkpoints_num: Number of checkpoints to keep. A value of - `None` keeps all checkpoints. Defaults to `None`. If set, need - to provide `checkpoint_score_attr`. - checkpoint_score_metric: Specifies by which metric to rank the - best checkpoint. Defaults to training iteration. - checkpoint_score_mode: Must be one of [min, max]. Determines - whether ``checkpoint_score_metric`` should be minimized or maximized. - If not set, will be the same as 'max'. Cannot be set if - ``checkpoint_score_metric`` is not set. - """ - - keep_checkpoints_num: Optional[int] = None - checkpoint_score_metric: Optional[str] = None - checkpoint_score_mode: Optional[str] = None - - def __post_init__(self): - if self.checkpoint_score_mode not in (None, "min", "max"): - raise ValueError( - "The `checkpoint_score_mode` parameter can only be " - f"either None, 'min' or 'max', got {self.checkpoint_score_mode}." 
- ) - if ( - self.checkpoint_score_metric is None - and self.checkpoint_score_mode is not None - ): - raise ValueError( - "`checkpoint_score_mode` cannot be set if " - "`checkpoint_score_metric` is not set." - ) - - @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" - if self.checkpoint_score_metric is None: - return self.checkpoint_score_metric - prefix = "" - if self.checkpoint_score_mode == "min": - prefix = "min-" - return f"{prefix}{self.checkpoint_score_metric}" - - @property - def checkpoint_score_mode_not_none(self) -> str: - """``checkpoint_score_mode`` but None -> 'max'""" - return self.checkpoint_score_mode or "max" - - @dataclass @PublicAPI(stability="alpha") class RunConfig: @@ -365,5 +314,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointConfig] = None + checkpoint_config: Optional[CheckpointStrategy] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 136cb0e58473..d7054c5a81bf 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointConfig +from ray.air.config import CheckpointStrategy from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -42,25 +42,25 @@ def test_run_config(): def test_checkpointing_config(): # cannot set checkpoint_score_mode if checkpoint_score_metric is unset with pytest.raises(ValueError): - CheckpointConfig(checkpoint_score_mode="min") + CheckpointStrategy(checkpoint_score_mode="min") with pytest.raises(ValueError): - CheckpointConfig( + CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="invalid" ) - checkpointing = CheckpointConfig() + checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointConfig(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointConfig( + checkpointing = CheckpointStrategy( checkpoint_score_metric="metric", checkpoint_score_mode="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 9a27acd10e36..4b4dc9b8d113 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -230,6 +230,16 @@ def __post_init__(self): f"checkpoint_score_order must be either " f'"{MAX}" or "{MIN}".' 
) + @property + def checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + if self.checkpoint_score_attribute is None: + return self.checkpoint_score_attribute + prefix = "" + if self.checkpoint_score_order == MIN: + prefix = "min-" + return f"{prefix}{self.checkpoint_score_attribute}" + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune. From e19d40f542dd8e3af89042331ccc1d94d48692cf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 15:15:17 +0200 Subject: [PATCH 40/63] Fix --- python/ray/air/tests/test_api.py | 12 ++++-------- python/ray/tune/impl/tuner_internal.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index d7054c5a81bf..1c0680860e3f 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -40,28 +40,24 @@ def test_run_config(): def test_checkpointing_config(): - # cannot set checkpoint_score_mode if checkpoint_score_metric is unset - with pytest.raises(ValueError): - CheckpointStrategy(checkpoint_score_mode="min") - with pytest.raises(ValueError): CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="invalid" + checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) checkpointing = CheckpointStrategy() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_metric="metric") + checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="max" + checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" checkpointing = CheckpointStrategy( - checkpoint_score_metric="metric", checkpoint_score_mode="min" + checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 60b89b8acb98..2d348c94d47a 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -163,7 +163,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else 0 ), keep_checkpoints_num=( - self._run_config.checkpoint_config.keep_checkpoints_num + self._run_config.checkpoint_config.num_to_keep if self._run_config.checkpoint_config else None ), From fd961746ca582263ecbc6bacc4342e915bd74416 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:43:53 +0000 Subject: [PATCH 41/63] dataframe -> metrics_dataframe --- python/ray/air/result.py | 4 ++-- python/ray/tune/result_grid.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index d6fa35a4a809..5b7f0fcba04b 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -24,7 +24,7 @@ class Result: metrics: The final metrics as reported by an Trainable. checkpoint: The final checkpoint of the Trainable. error: The execution error of the Trainable run, if the trial finishes in error. - dataframe: The full result dataframe of the Trainable. + metrics_dataframe: The full result dataframe of the Trainable. The dataframe is indexed by iterations and contains reported metrics. 
best_checkpoints: A list of tuples of the best checkpoints saved @@ -37,7 +37,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - dataframe: Optional[pd.DataFrame] + metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] @property diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index ed6711414985..b0cc6f83899e 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -180,7 +180,9 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - dataframe=self._experiment_analysis.trial_dataframes.get(trial.logdir) + metrics_dataframe=self._experiment_analysis.trial_dataframes.get( + trial.logdir + ) if self._experiment_analysis else None, best_checkpoints=best_checkpoints, From 8d5f1b3d63d6b58843f2fd51d39c8502b2293015 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:04:09 +0000 Subject: [PATCH 42/63] CheckpointStrategy -> CheckpointConfig --- doc/source/train/api.rst | 4 --- doc/source/train/user_guide.rst | 10 +++--- python/ray/air/__init__.py | 4 +-- python/ray/air/config.py | 6 ++-- python/ray/air/tests/test_api.py | 12 +++---- python/ray/train/__init__.py | 6 +++- python/ray/train/_internal/checkpoint.py | 8 ++--- python/ray/train/data_parallel_trainer.py | 6 ++-- python/ray/train/tests/test_trainer.py | 12 +++---- python/ray/train/trainer.py | 16 ++++----- python/ray/tune/callback.py | 4 +-- .../ray/tune/execution/checkpoint_manager.py | 4 +-- .../ray/util/ml_utils/checkpoint_manager.py | 33 ++++++++++++++----- .../ml_utils/tests/test_checkpoint_manager.py | 14 ++++---- 14 files changed, 79 insertions(+), 60 deletions(-) diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index 054ad0f30d10..ea8b879cdcd9 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -117,10 +117,6 @@ Checkpointing .. _train-api-checkpoint-strategy: -CheckpointStrategy -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ray.train.CheckpointStrategy .. _train-api-func-utils: diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 7dc1bd79ce3a..ff2b2556afb0 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -700,13 +700,13 @@ As an example, to completely disable writing checkpoints to disk: :emphasize-lines: 8,12 from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): for epoch in range(3): train.save_checkpoint(epoch=epoch) - checkpoint_strategy = CheckpointStrategy(num_to_keep=0) + checkpoint_strategy = CheckpointConfig(num_to_keep=0) trainer = Trainer(backend="torch", num_workers=2) trainer.start() @@ -714,12 +714,12 @@ As an example, to completely disable writing checkpoints to disk: trainer.shutdown() -You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: +You may also config ``CheckpointConfig`` to keep the "N best" checkpoints persisted to disk. The following example shows how you could keep the 2 checkpoints with the lowest "loss" value: .. 
code-block:: python from ray import train - from ray.train import CheckpointStrategy, Trainer + from ray.train import CheckpointConfig, Trainer def train_func(): @@ -733,7 +733,7 @@ You may also config ``CheckpointStrategy`` to keep the "N best" checkpoints pers train.save_checkpoint(loss=3) # Keep the 2 checkpoints with the smallest "loss" value. - checkpoint_strategy = CheckpointStrategy(num_to_keep=2, + checkpoint_strategy = CheckpointConfig(num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min") diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 922f1bc83b94..506f9d022cc0 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -4,8 +4,8 @@ RunConfig, ScalingConfig, FailureConfig, + CheckpointConfig, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy from ray.air.data_batch_type import DataBatchType from ray.air.result import Result from ray.air.util.datasets import train_test_split @@ -18,6 +18,6 @@ "ScalingConfig", "DatasetConfig", "FailureConfig", - "CheckpointStrategy", + "CheckpointConfig", "train_test_split", ] diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 00db7cdcf33b..5b7317d283ca 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -5,7 +5,9 @@ from ray.tune.syncer import SyncConfig from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy + +# Move here later when ml_utils is deprecated +from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: from ray.data import Dataset @@ -314,5 +316,5 @@ class RunConfig: stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None failure_config: Optional[FailureConfig] = None sync_config: Optional[SyncConfig] = None - checkpoint_config: Optional[CheckpointStrategy] = None + checkpoint_config: Optional[CheckpointConfig] = None verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index 1c0680860e3f..e4474c24626a 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ from ray.air import Checkpoint from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated from ray.air.config import ScalingConfigDataClass -from ray.air.config import CheckpointStrategy +from ray.air.config import CheckpointConfig from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer @@ -41,22 +41,22 @@ def test_run_config(): def test_checkpointing_config(): with pytest.raises(ValueError): - CheckpointStrategy( + CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="invalid" ) - checkpointing = CheckpointStrategy() + checkpointing = CheckpointConfig() assert checkpointing.checkpoint_score_attr is None - checkpointing = CheckpointStrategy(checkpoint_score_attribute="metric") + checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) assert checkpointing.checkpoint_score_attr == "metric" - checkpointing = CheckpointStrategy( + checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) assert checkpointing.checkpoint_score_attr == "min-metric" diff 
--git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 11407fa8a16a..74d360f117b7 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -12,13 +12,16 @@ world_size, ) from ray.train.trainer import Trainer, TrainingIterator +from ray.air.config import CheckpointConfig + +# deprecated from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") __all__ = [ "BackendConfig", - "CheckpointStrategy", + "CheckpointConfig", "get_dataset_shard", "load_checkpoint", "local_rank", @@ -30,4 +33,5 @@ "world_rank", "world_size", "TRAIN_DATASET_KEY", + "CheckpointStrategy", ] diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 0a85f4396e36..8bffe957833d 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -11,7 +11,7 @@ TUNE_CHECKPOINT_ID, TUNE_INSTALLED, ) -from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointStrategy +from ray.util.ml_utils.checkpoint_manager import CheckpointStorage, CheckpointConfig from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager as CommonCheckpointManager, ) @@ -67,7 +67,7 @@ class CheckpointManager(CommonCheckpointManager): def __init__( self, run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.run_dir = run_dir @@ -136,11 +136,11 @@ def _get_next_checkpoint_path(self) -> Optional[Path]: def on_start_training( self, - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Path, latest_checkpoint_id: Optional[int] = 0, ): - checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + checkpoint_strategy = checkpoint_strategy or CheckpointConfig() self._checkpoint_strategy = checkpoint_strategy self._validate_checkpoint_strategy() diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 59ed5cb0be4c..c8729282ab40 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -8,7 +8,7 @@ from ray import tune from ray.air import session from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.air.config import DatasetConfig, RunConfig, ScalingConfig, CheckpointConfig from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY from ray.train import BackendConfig, TrainingIterator from ray.train._internal.backend_executor import BackendExecutor, TrialInfo @@ -18,7 +18,7 @@ from ray.train.constants import TRAIN_DATASET_KEY, WILDCARD_KEY from ray.train.trainer import BaseTrainer, GenDataset from ray.util.annotations import DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy, _TrackedCheckpoint +from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.data.preprocessor import Preprocessor @@ -32,7 +32,7 @@ def __init__( self, preprocessor: "Preprocessor", run_dir: Optional[Path] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ): self.preprocessor = preprocessor super(_DataParallelCheckpointManager, self).__init__( diff --git a/python/ray/train/tests/test_trainer.py b/python/ray/train/tests/test_trainer.py index 207d45f79d28..d366999e8611 100644 --- a/python/ray/train/tests/test_trainer.py +++ 
b/python/ray/train/tests/test_trainer.py @@ -10,7 +10,7 @@ import ray import ray.train as train from ray._private.test_utils import wait_for_condition -from ray.train import Trainer, CheckpointStrategy +from ray.train import Trainer, CheckpointConfig from ray.train.backend import BackendConfig, Backend from ray.train.constants import TRAIN_ENABLE_WORKER_SPREAD_ENV from ray.train.torch import TorchConfig @@ -514,7 +514,7 @@ def test_persisted_checkpoint_strategy(ray_start_2_cpus): logdir = "/tmp/test/trainer/test_persisted_checkpoint_strategy" config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -555,7 +555,7 @@ def validate(): def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir): config = TestConfig() - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( checkpoint_score_attribute="loss", checkpoint_score_order="min" ) @@ -585,12 +585,12 @@ def train_func(): trainer.start() with pytest.raises(ValueError): - trainer.run(train_func, checkpoint_strategy=CheckpointStrategy(num_to_keep=-1)) + trainer.run(train_func, checkpoint_strategy=CheckpointConfig(num_to_keep=-1)) with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_order="invalid_order" ), ) @@ -598,7 +598,7 @@ def train_func(): with pytest.raises(ValueError): trainer.run( train_func, - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( checkpoint_score_attribute="missing_attribute" ), ) diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py index 1980f16235df..0262f92963d8 100644 --- a/python/ray/train/trainer.py +++ b/python/ray/train/trainer.py @@ -9,6 +9,7 @@ import ray from ray.actor import ActorHandle from ray.air.checkpoint import Checkpoint +from ray.air.config import CheckpointConfig from ray.train._internal.backend_executor import ( BackendExecutor, InactiveWorkerGroupError, @@ -42,7 +43,6 @@ TUNE_INSTALLED, ) from ray.util.annotations import Deprecated, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy if TUNE_INSTALLED: from ray import tune @@ -293,7 +293,7 @@ def run( callbacks: Optional[List[TrainingCallback]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> List[T]: """Runs a training function in a distributed manner. @@ -321,7 +321,7 @@ def run( or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -373,7 +373,7 @@ def run_iterator( config: Optional[Dict[str, Any]] = None, dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None, checkpoint: Optional[Union[Dict, str, Path]] = None, - checkpoint_strategy: Optional[CheckpointStrategy] = None, + checkpoint_strategy: Optional[CheckpointConfig] = None, ) -> "TrainingIterator": """Same as ``run`` except returns an iterator over the results. 
@@ -411,7 +411,7 @@ def train_func(config): ``str`` or ``Path`` then the value is expected to be a path to a file that contains a serialized checkpoint dict. If this is ``None`` then no checkpoint will be loaded. - checkpoint_strategy (Optional[CheckpointStrategy]): The + checkpoint_strategy (Optional[CheckpointConfig]): The configurations for saving checkpoints. Returns: @@ -462,7 +462,7 @@ def latest_checkpoint_dir(self) -> Optional[Path]: def best_checkpoint_path(self) -> Optional[Path]: """Path to the best persisted checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -486,7 +486,7 @@ def latest_checkpoint(self) -> Optional[Dict]: def best_checkpoint(self) -> Optional[Dict]: """Best saved checkpoint from the latest run. - "Best" is defined by the input ``CheckpointStrategy``. + "Best" is defined by the input ``CheckpointConfig``. Default behavior is to return the most recent checkpoint. Returns ``None`` if ``run()`` has not been called or if @@ -670,7 +670,7 @@ def __init__( dataset_spec: RayDatasetSpec, checkpoint_manager: CheckpointManager, checkpoint: Optional[Union[Dict, str, Path, Checkpoint]], - checkpoint_strategy: Optional[CheckpointStrategy], + checkpoint_strategy: Optional[CheckpointConfig], run_dir: Optional[Path] = None, ): self._backend_executor = backend_executor diff --git a/python/ray/tune/callback.py b/python/ray/tune/callback.py index fcf4e24aee3d..450ee55310f7 100644 --- a/python/ray/tune/callback.py +++ b/python/ray/tune/callback.py @@ -3,11 +3,11 @@ import warnings from ray.util.annotations import PublicAPI, DeveloperAPI -from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint if TYPE_CHECKING: from ray.tune.experiment import Trial from ray.tune.stopper import Stopper + from ray.util.ml_utils.checkpoint_manager import _TrackedCheckpoint class _CallbackMeta(ABCMeta): @@ -251,7 +251,7 @@ def on_checkpoint( iteration: int, trials: List["Trial"], trial: "Trial", - checkpoint: _TrackedCheckpoint, + checkpoint: "_TrackedCheckpoint", **info, ): """Called after a trial saved a checkpoint with Tune. 
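A minimal sketch of how the pieces introduced by this series are expected to fit together once the rename lands: ``CheckpointConfig`` flows in through ``RunConfig(checkpoint_config=...)`` and the retained checkpoints come back out on ``Result.best_checkpoints``. The toy ``trainable``, the reported ``loss`` values, and the exact import paths below are illustrative assumptions, not part of any patch in this series:

.. code-block:: python

    import json
    import os

    from ray import tune
    from ray.air.config import CheckpointConfig, RunConfig
    from ray.tune.tuner import Tuner


    def trainable(config):
        # Save a checkpoint and report the same metrics on every iteration.
        for step in range(3):
            metrics = {"step": step, "loss": 1.0 / (step + 1)}
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
                    f.write(json.dumps(metrics))
            tune.report(**metrics)


    # Keep only the two best checkpoints on disk, ranked by the reported "loss".
    run_config = RunConfig(
        checkpoint_config=CheckpointConfig(
            num_to_keep=2,
            checkpoint_score_attribute="loss",
            checkpoint_score_order="min",
        )
    )

    result = Tuner(trainable, run_config=run_config).fit()[0]

    # Each entry is a (Checkpoint, metrics) tuple kept by the checkpoint manager.
    for checkpoint, metrics in result.best_checkpoints:
        print(metrics["step"], metrics["loss"])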
diff --git a/python/ray/tune/execution/checkpoint_manager.py b/python/ray/tune/execution/checkpoint_manager.py index 64b68a7fb416..f1295ac6f604 100644 --- a/python/ray/tune/execution/checkpoint_manager.py +++ b/python/ray/tune/execution/checkpoint_manager.py @@ -4,7 +4,7 @@ from ray.tune.result import TRAINING_ITERATION from ray.util.ml_utils.checkpoint_manager import ( - CheckpointStrategy, + CheckpointConfig, MIN, MAX, _CheckpointManager as CommonCheckpointManager, @@ -51,7 +51,7 @@ def __init__( else: checkpoint_score_attr = checkpoint_score_attr - checkpoint_strategy = CheckpointStrategy( + checkpoint_strategy = CheckpointConfig( num_to_keep=keep_checkpoints_num, checkpoint_score_attribute=checkpoint_score_attr, checkpoint_score_order=MIN if checkpoint_score_desc else MAX, diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 12be4aea9f99..493be617e2d4 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -14,8 +14,7 @@ import ray from ray.air import Checkpoint from ray.tune.result import NODE_IP -from ray.util import PublicAPI -from ray.util.annotations import DeveloperAPI +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI from ray.util.ml_utils.util import is_nan MAX = "max" @@ -186,9 +185,10 @@ def __repr__(self): return f"_HeapCheckpoint({repr(self.tracked_checkpoint)})" -@PublicAPI(stability="beta") +# Move to ray.air.config when ml_utils is deprecated. @dataclass -class CheckpointStrategy: +@PublicAPI(stability="alpha") +class CheckpointConfig: """Configurable parameters for defining the checkpointing strategy. Default behavior is to persist all checkpoints to disk. If @@ -196,7 +196,7 @@ class CheckpointStrategy: checkpoints with maximum timestamp, i.e. the most recent checkpoints. Args: - num_to_keep (Optional[int]): The number of checkpoints to keep + num_to_keep: The number of checkpoints to keep on disk for this run. If a checkpoint is persisted to disk after there are already this many checkpoints, then an existing checkpoint will be deleted. If this is ``None`` then checkpoints @@ -208,7 +208,7 @@ class CheckpointStrategy: This attribute must be a key from the checkpoint dictionary which has a numerical value. Per default, the last checkpoints will be kept. - checkpoint_score_order (str). Either "max" or "min". + checkpoint_score_order: Either "max" or "min". If "max", then checkpoints with highest values of ``checkpoint_score_attribute`` will be kept. If "min", then checkpoints with lowest values of @@ -242,6 +242,23 @@ def checkpoint_score_attr(self) -> Optional[str]: return f"{prefix}{self.checkpoint_score_attribute}" +# Alias for backwards compatibility + +deprecation_message = ( + "`CheckpointStrategy` is deprecated and will be removed in " + "the future. Please use `ray.air.config.CheckpointConfig` " + "instead." +) + + +@Deprecated(message=deprecation_message) +@dataclass +class CheckpointStrategy(CheckpointConfig): + def __post_init__(self): + logger.warning(deprecation_message) + super().__post_init__() + + class _CheckpointManager: """Common checkpoint management and bookkeeping class for Ray Train and Tune.
@@ -269,11 +286,11 @@ class _CheckpointManager: def __init__( self, - checkpoint_strategy: CheckpointStrategy, + checkpoint_strategy: CheckpointConfig, latest_checkpoint_id: int = 0, delete_fn: Optional[Callable[["_TrackedCheckpoint"], None]] = None, ): - self._checkpoint_strategy = checkpoint_strategy or CheckpointStrategy() + self._checkpoint_strategy = checkpoint_strategy or CheckpointConfig() # Incremental unique checkpoint ID of this run. self._latest_checkpoint_id = latest_checkpoint_id diff --git a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py index 16fd83a8ecb8..0c0a145ad26b 100644 --- a/python/ray/util/ml_utils/tests/test_checkpoint_manager.py +++ b/python/ray/util/ml_utils/tests/test_checkpoint_manager.py @@ -2,13 +2,13 @@ from ray.util.ml_utils.checkpoint_manager import ( _CheckpointManager, CheckpointStorage, - CheckpointStrategy, + CheckpointConfig, _TrackedCheckpoint, ) def test_unlimited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) for i in range(10): cpm.register_checkpoint( @@ -19,7 +19,7 @@ def test_unlimited_persistent_checkpoints(): def test_limited_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=2)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=2)) for i in range(10): cpm.register_checkpoint( @@ -30,7 +30,7 @@ def test_limited_persistent_checkpoints(): def test_no_persistent_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=0)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=0)) for i in range(10): cpm.register_checkpoint( @@ -41,7 +41,7 @@ def test_no_persistent_checkpoints(): def test_dont_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = False for i in range(10): @@ -53,7 +53,7 @@ def test_dont_persist_memory_checkpoints(): def test_persist_memory_checkpoints(): - cpm = _CheckpointManager(checkpoint_strategy=CheckpointStrategy(num_to_keep=None)) + cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) cpm._persist_memory_checkpoints = True for i in range(10): @@ -66,7 +66,7 @@ def test_persist_memory_checkpoints(): def test_keep_best_checkpoints(): cpm = _CheckpointManager( - checkpoint_strategy=CheckpointStrategy( + checkpoint_strategy=CheckpointConfig( num_to_keep=2, checkpoint_score_attribute="metric", checkpoint_score_order="min", From 0482bce4c4bc0b2204283b5455eb9b5474de90b0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 16:10:20 +0000 Subject: [PATCH 43/63] Missed this --- doc/source/train/user_guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index ff2b2556afb0..8a75792732cf 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -680,7 +680,7 @@ directory ` of each run. # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints # By default, the "best" checkpoint path will refer to the most recent one. - # This can be configured by defining a CheckpointStrategy. + # This can be configured by defining a CheckpointConfig. 
print(trainer.best_checkpoint_path) # /home/ray_results/train_2021-09-01_12-00-00/run_001/checkpoints/checkpoint_000005 From 0cb579ddd3b868849453d4a72239fd3581873e07 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 11:47:44 -0700 Subject: [PATCH 44/63] Update test_result_grid.py --- python/ray/tune/tests/test_result_grid.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index fde9713908d0..ea7bf0a5bcb3 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -58,7 +58,7 @@ def f(config): assert isinstance(result.best_checkpoints, list) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert os.path.normpath( result.checkpoint.get_internal_representation()[1] ) != os.path.normpath( @@ -68,7 +68,7 @@ def f(config): ) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_metric_mode_unset(ray_start_2_cpus): @@ -87,10 +87,10 @@ def f(config): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert isinstance(result.dataframe, pd.DataFrame) + assert isinstance(result.metrics_dataframe, pd.DataFrame) assert result.config == {"a": 1} assert result.metrics["config"] == result.config - assert len(result.dataframe) == 2 + assert len(result.metrics_dataframe) == 2 def test_result_grid_no_checkpoint(ray_start_2_cpus): @@ -131,7 +131,7 @@ def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object): assert isinstance(result.checkpoint, Checkpoint) assert isinstance(result.metrics, dict) assert isinstance(result.config, dict) - assert result.dataframe is None + assert result.metrics_dataframe is None assert result.config == {"some_config": 1} assert result.metrics["config"] == result.config From 7ade7e4878c87212fdb6d3707fc10e3447a76164 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 21:09:20 +0000 Subject: [PATCH 45/63] Fix --- python/ray/train/tests/test_examples.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 2ebef818d7aa..316ff4dc5fc3 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -51,7 +51,7 @@ def test_tensorflow_mnist(ray_start_4_cpus, num_workers): assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -107,7 +107,7 @@ def test_torch_linear(ray_start_4_cpus, num_workers): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] @@ -146,7 +146,7 @@ def test_torch_fashion_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == epochs - loss = list(results.dataframe["loss"]) + loss = list(results.metrics_dataframe["loss"]) assert len(loss) == epochs assert loss[-1] < loss[0] From 0937dc857fb32d4cf0ff99bb8daeae5d7c2ade85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: 
Fri, 24 Jun 2022 21:33:14 +0000 Subject: [PATCH 46/63] Apply feedback from code review --- doc/source/ray-air/package-ref.rst | 3 +++ doc/source/train/api.rst | 6 ------ doc/source/train/user_guide.rst | 2 +- python/ray/air/config.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 1 + 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 3b57ecfe6dc5..24586c5c620b 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,3 +124,6 @@ Configs .. automodule:: ray.air.config :members: +.. _train-api-checkpoint-config: + +.. autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index ea8b879cdcd9..babf01861019 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -112,12 +112,6 @@ TorchTensorboardProfilerCallback .. autoclass:: ray.train.callbacks.TorchTensorboardProfilerCallback -Checkpointing ------------- - -.. _train-api-checkpoint-strategy: - - .. _train-api-func-utils: Training Function Utilities diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 8a75792732cf..701492136952 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,7 +691,7 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-strategy` can be passed into +checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into ``Trainer.run``. As an example, to completely disable writing checkpoints to disk: diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 5b7317d283ca..cec982da4188 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -6,7 +6,7 @@ from ray.tune.utils.log import Verbosity from ray.util.annotations import PublicAPI -# Move here later when ml_utils is deprecated +# Move here later when ml_utils is deprecated. Doing it now causes a circular import. from ray.util.ml_utils.checkpoint_manager import CheckpointConfig if TYPE_CHECKING: diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 493be617e2d4..a3153dbd5e06 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -186,6 +186,7 @@ def __repr__(self): # Move to ray.air.config when ml_utils is deprecated. +# Doing it now causes a circular import. @dataclass @PublicAPI(stability="alpha") class CheckpointConfig: From b99362770375c0bda7b0087e88fe58a0891933f5 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 22:34:27 +0000 Subject: [PATCH 47/63] Fix lint --- doc/source/ray-air/package-ref.rst | 2 -- doc/source/train/user_guide.rst | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 24586c5c620b..744206354232 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -124,6 +124,4 @@ Configs .. automodule:: ray.air.config :members: -.. _train-api-checkpoint-config: - - ..
autoclass:: ray.air.config.CheckpointConfig \ No newline at end of file diff --git a/doc/source/train/user_guide.rst b/doc/source/train/user_guide.rst index 701492136952..be799a49e666 100644 --- a/doc/source/train/user_guide.rst +++ b/doc/source/train/user_guide.rst @@ -691,8 +691,8 @@ Configuring checkpoints +++++++++++++++++++++++ For more configurability of checkpointing behavior (specifically saving -checkpoints to disk), a :ref:`train-api-checkpoint-config` can be passed into -``Trainer.run``. +checkpoints to disk), a :class:`CheckpointConfig` can be passed into +``Trainer``. As an example, to completely disable writing checkpoints to disk: From ed870bd4f72bd8d64d8a24d651a21b068114c3d2 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:46:51 -0700 Subject: [PATCH 48/63] Update python/ray/train/__init__.py --- python/ray/train/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 74d360f117b7..5039ae461499 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -14,7 +14,7 @@ from ray.train.trainer import Trainer, TrainingIterator from ray.air.config import CheckpointConfig -# deprecated +# Deprecated. Alias of CheckpointConfig for backwards compat from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy usage_lib.record_library_usage("train") From a4fd532ea77f16b0e4e70e738eeceacdc7912d85 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 27 Jun 2022 18:26:44 +0000 Subject: [PATCH 49/63] Fix CI --- .../ray/train/examples/tune_cifar_pytorch_pbt_example.py | 2 +- python/ray/train/tests/test_minimal.py | 9 ++++----- python/ray/train/tests/test_tune.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py index a7031b3116a1..38abba231ae8 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py @@ -185,7 +185,7 @@ def train_func(config): ), run_config=RunConfig( stop={"training_iteration": 2 if args.smoke_test else 100}, - failure=FailureConfig(max_failures=3), # used for fault tolerance + failure_config=FailureConfig(max_failures=3), # used for fault tolerance ), ) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index a23d7b4f23f9..7541edb16852 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -1,7 +1,7 @@ import pytest import ray -import ray.train as train +from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -38,10 +38,9 @@ def test_run(ray_start_4_cpus): config = TestConfig() def train_func(): - checkpoint = train.load_checkpoint() - train.report(**checkpoint) - train.save_checkpoint(**checkpoint) - return checkpoint[key] + checkpoint = session.get_checkpoint() + session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint) + return checkpoint.to_dict()[key] checkpoint = Checkpoint.from_dict( { diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 0196a84e46b6..e407679268ad 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -200,7 +200,9 @@ def train_func(): trainer = DataParallelTrainer( train_func, 
backend_config=TestConfig(), scaling_config=dict(num_workers=1) ) - tuner = Tuner(trainer, run_config=RunConfig(failure=FailureConfig(max_failures=3))) + tuner = Tuner( + trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3)) + ) analysis = tuner.fit()._experiment_analysis checkpoint_path = analysis.trials[0].checkpoint.dir_or_data From d0ae2ba1998b544b01897d03768b0fa0d9a5c3e7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:21:57 -0700 Subject: [PATCH 50/63] Use warnings.warn --- python/ray/util/ml_utils/checkpoint_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index a3153dbd5e06..1f64633a9666 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -7,6 +7,7 @@ import os import shutil import tempfile +import warnings from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -256,7 +257,7 @@ def checkpoint_score_attr(self) -> Optional[str]: @dataclass class CheckpointStrategy(CheckpointConfig): def __post_init__(self): - logger.warning(deprecation_message) + warnings.warn(deprecation_message) super().__post_init__() From d44f75026ad47a6fcea4035e1ee8de68bd1980a3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 11:47:05 -0700 Subject: [PATCH 51/63] Make method privat --- python/ray/air/tests/test_api.py | 8 ++++---- python/ray/tune/impl/tuner_internal.py | 2 +- python/ray/util/ml_utils/checkpoint_manager.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index e4474c24626a..ffd2d722378c 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -46,20 +46,20 @@ def test_checkpointing_config(): ) checkpointing = CheckpointConfig() - assert checkpointing.checkpoint_score_attr is None + assert checkpointing._tune_legacy_checkpoint_score_attr is None checkpointing = CheckpointConfig(checkpoint_score_attribute="metric") - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="max" ) - assert checkpointing.checkpoint_score_attr == "metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "metric" checkpointing = CheckpointConfig( checkpoint_score_attribute="metric", checkpoint_score_order="min" ) - assert checkpointing.checkpoint_score_attr == "min-metric" + assert checkpointing._tune_legacy_checkpoint_score_attr == "min-metric" def test_scaling_config(): diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 2d348c94d47a..d1e01e0a8e8d 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -168,7 +168,7 @@ def _get_tune_run_arguments(self) -> Dict[str, Any]: else None ), checkpoint_score_attr=( - self._run_config.checkpoint_config.checkpoint_score_attr + self._run_config.checkpoint_config._tune_legacy_checkpoint_score_attr if self._run_config.checkpoint_config else None ), diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 1f64633a9666..c687ca48e8ad 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ 
b/python/ray/util/ml_utils/checkpoint_manager.py @@ -234,8 +234,10 @@ def __post_init__(self): ) @property - def checkpoint_score_attr(self) -> Optional[str]: - """Same as ``checkpoint_score_attr`` in ``tune.run``.""" + def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: + """Same as ``checkpoint_score_attr`` in ``tune.run``. + + Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c9d33806a000f3bb84bb68ca467e6bd7d6675923 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:39:36 -0700 Subject: [PATCH 52/63] Update python/ray/util/ml_utils/checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index c687ca48e8ad..4b6576a20d79 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,6 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute From 5c0a75317897b9ee37f6dfd7899342c8ed490cb8 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 28 Jun 2022 13:41:17 -0700 Subject: [PATCH 53/63] Update checkpoint_manager.py --- python/ray/util/ml_utils/checkpoint_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/util/ml_utils/checkpoint_manager.py b/python/ray/util/ml_utils/checkpoint_manager.py index 4b6576a20d79..e6e58ff77402 100644 --- a/python/ray/util/ml_utils/checkpoint_manager.py +++ b/python/ray/util/ml_utils/checkpoint_manager.py @@ -236,7 +236,9 @@ def __post_init__(self): @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. - Only used for Legacy API compatibility.""" + + Only used for Legacy API compatibility. 
+ """ if self.checkpoint_score_attribute is None: return self.checkpoint_score_attribute prefix = "" From c7b783b05f6fd76d0ba4f2febc6d58d3720e0ccf Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 29 Jun 2022 11:39:02 -0700 Subject: [PATCH 54/63] Fix test --- python/ray/train/tests/test_tune.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e407679268ad..e25193eddf54 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,7 +5,7 @@ import ray import ray.train as train from ray import tune -from ray.air import Checkpoint +from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig @@ -154,13 +154,16 @@ def train_func(): def test_reuse_checkpoint(ray_start_4_cpus): def train_func(config): itr = 0 - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() if ckpt is not None: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, config["max_iter"]): - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) @@ -185,17 +188,20 @@ def train_func(config): def test_retry(ray_start_4_cpus): def train_func(): - ckpt = train.load_checkpoint() + ckpt = session.get_checkpoint() restored = bool(ckpt) # Does a previous checkpoint exist? itr = 0 if ckpt: + ckpt = ckpt.to_dict() itr = ckpt["iter"] + 1 for i in range(itr, 4): if i == 2 and not restored: raise Exception("try to fail me") - train.save_checkpoint(iter=i) - train.report(test=i, training_iteration=i) + session.report( + dict(test=i, training_iteration=i), + checkpoint=Checkpoint.from_dict(dict(iter=i)), + ) trainer = DataParallelTrainer( train_func, backend_config=TestConfig(), scaling_config=dict(num_workers=1) From 2e9ec6644b4e6b06c47f96918df28ffd0d8e97e3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:39:04 +0000 Subject: [PATCH 55/63] Rename files --- doc/source/train/examples.rst | 8 ++++---- doc/source/train/examples/train_fashion_mnist_example.rst | 4 ++-- .../train/examples/train_linear_dataset_example.rst | 4 ++-- doc/source/train/examples/train_linear_example.rst | 4 ++-- .../train/examples/tune_cifar_pytorch_pbt_example.rst | 6 +++--- python/ray/train/BUILD | 6 +++--- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- ...on_mnist_example.py => torch_fashion_mnist_example.py} | 0 ...dataset_example.py => torch_linear_dataset_example.py} | 0 .../{train_linear_example.py => torch_linear_example.py} | 0 ...rch_pbt_example.py => tune_cifar_torch_pbt_example.py} | 1 - python/ray/train/examples/tune_linear_example.py | 2 +- python/ray/train/tests/test_examples.py | 8 ++++++-- python/ray/train/tests/test_gpu.py | 6 +++--- python/ray/train/tests/test_tune.py | 2 +- .../workloads/pytorch_pbt_failure.py | 2 +- release/ml_user_tests/train/train_torch_linear_test.py | 2 +- 17 files changed, 30 insertions(+), 27 deletions(-) rename python/ray/train/examples/{train_fashion_mnist_example.py => torch_fashion_mnist_example.py} (100%) rename python/ray/train/examples/{train_linear_dataset_example.py => torch_linear_dataset_example.py} (100%) rename 
python/ray/train/examples/{train_linear_example.py => torch_linear_example.py} (100%) rename python/ray/train/examples/{tune_cifar_pytorch_pbt_example.py => tune_cifar_torch_pbt_example.py} (99%) diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 2a4e0b75bbd1..1529e63342cc 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -15,10 +15,10 @@ General Examples PyTorch ~~~~~~~ -* :doc:`/train/examples/train_linear_example`: +* :doc:`/train/examples/torch_linear_example`: Simple example for PyTorch. -* :doc:`/train/examples/train_fashion_mnist_example`: +* :doc:`/train/examples/torch_fashion_mnist_example`: End-to-end example for PyTorch. * :doc:`/train/examples/transformers/transformers_example`: @@ -59,7 +59,7 @@ Ray Datasets Integration Examples * :doc:`/train/examples/tensorflow_linear_dataset_example`: Simple example for training a linear TensorFlow model. -* :doc:`/train/examples/train_linear_dataset_example`: +* :doc:`/train/examples/torch_linear_dataset_example`: Simple example for training a linear PyTorch model. * :doc:`/train/examples/tune_torch_linear_dataset_example`: @@ -75,7 +75,7 @@ Ray Tune Integration Examples * :doc:`/train/examples/tune_tensorflow_mnist_example`: End-to-end example for tuning a TensorFlow model. -* :doc:`/train/examples/tune_cifar_pytorch_pbt_example`: +* :doc:`/train/examples/tune_cifar_torch_pbt_example`: End-to-end example for tuning a PyTorch model with PBT. .. diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/train_fashion_mnist_example.rst index e11849e7b5f5..7082cc1433db 100644 --- a/doc/source/train/examples/train_fashion_mnist_example.rst +++ b/doc/source/train/examples/train_fashion_mnist_example.rst @@ -1,6 +1,6 @@ :orphan: -train_fashion_mnist_example +torch_fashion_mnist_example =========================== -.. literalinclude:: /../../python/ray/train/examples/train_fashion_mnist_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/train_linear_dataset_example.rst index 5dfe21be0dc5..f84daeb67a7a 100644 --- a/doc/source/train/examples/train_linear_dataset_example.rst +++ b/doc/source/train/examples/train_linear_dataset_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_dataset_example +torch_linear_dataset_example ============================ -.. literalinclude:: /../../python/ray/train/examples/train_linear_dataset_example.py +.. literalinclude:: /../../python/ray/train/examples/torch_linear_dataset_example.py diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/train_linear_example.rst index 3abb4af64c81..10f3090d5196 100644 --- a/doc/source/train/examples/train_linear_example.rst +++ b/doc/source/train/examples/train_linear_example.rst @@ -1,6 +1,6 @@ :orphan: -train_linear_example +torch_linear_example ==================== -.. literalinclude:: /../../python/ray/train/examples/train_linear_example.py +.. 
literalinclude:: /../../python/ray/train/examples/torch_linear_example.py diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst index 5a1f156d8ee7..dae870f3247e 100644 --- a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst +++ b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst @@ -1,6 +1,6 @@ :orphan: -tune_cifar_pytorch_pbt_example -============================== +tune_cifar_torch_pbt_example +============================ -.. literalinclude:: /../../python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +.. literalinclude:: /../../python/ray/train/examples/tune_cifar_torch_pbt_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 6f719b725e64..89f32eda50e2 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -64,10 +64,10 @@ py_test( ) py_test( - name = "tune_cifar_pytorch_pbt_example", + name = "tune_cifar_torch_pbt_example", size = "medium", - main = "examples/tune_cifar_pytorch_pbt_example.py", - srcs = ["examples/tune_cifar_pytorch_pbt_example.py"], + main = "examples/tune_cifar_torch_pbt_example.py", + srcs = ["examples/tune_cifar_torch_pbt_example.py"], tags = ["team:ml", "exclusive", "pytorch", "tune"], deps = [":train_lib"], args = ["--smoke-test"] diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 1cda7fc3e1ac..2d223c43ec1d 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -1,7 +1,7 @@ import argparse from ray.air import RunConfig -from ray.train.examples.train_fashion_mnist_example import train_func +from ray.train.examples.torch_fashion_mnist_example import train_func from ray.train.torch import TorchTrainer from ray.tune.integration.mlflow import MLflowLoggerCallback diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py similarity index 100% rename from python/ray/train/examples/train_fashion_mnist_example.py rename to python/ray/train/examples/torch_fashion_mnist_example.py diff --git a/python/ray/train/examples/train_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py similarity index 100% rename from python/ray/train/examples/train_linear_dataset_example.py rename to python/ray/train/examples/torch_linear_dataset_example.py diff --git a/python/ray/train/examples/train_linear_example.py b/python/ray/train/examples/torch_linear_example.py similarity index 100% rename from python/ray/train/examples/train_linear_example.py rename to python/ray/train/examples/torch_linear_example.py diff --git a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py similarity index 99% rename from python/ray/train/examples/tune_cifar_pytorch_pbt_example.py rename to python/ray/train/examples/tune_cifar_torch_pbt_example.py index 38abba231ae8..f0b5c786ff8d 100644 --- a/python/ray/train/examples/tune_cifar_pytorch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -58,7 +58,6 @@ def validate_epoch(dataloader, model, loss_fn): def train_func(config): - # print(config) epochs = config.pop("epochs", 3) model = ResNet18(config) model = train.torch.prepare_model(model) diff --git a/python/ray/train/examples/tune_linear_example.py b/python/ray/train/examples/tune_linear_example.py 
index 5d4a8edc911b..096c35547842 100644 --- a/python/ray/train/examples/tune_linear_example.py +++ b/python/ray/train/examples/tune_linear_example.py @@ -1,6 +1,6 @@ import argparse -from train_linear_example import train_func +from torch_linear_example import train_func import ray from ray import tune diff --git a/python/ray/train/tests/test_examples.py b/python/ray/train/tests/test_examples.py index 316ff4dc5fc3..1bb0753b1c6b 100644 --- a/python/ray/train/tests/test_examples.py +++ b/python/ray/train/tests/test_examples.py @@ -16,10 +16,10 @@ from ray.train.examples.torch_quick_start import ( train_func as torch_quick_start_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import train_func as linear_train_func +from ray.train.examples.torch_linear_example import train_func as linear_train_func from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.tests.test_trainer import KillCallback @@ -172,6 +172,10 @@ def test_horovod_torch_mnist(ray_start_4_cpus): result = results.metrics assert result[TRAINING_ITERATION] == num_workers + loss = list(results.metrics_dataframe["loss"]) + assert len(loss) == num_epochs + assert loss[-1] < loss[0] + # TODO: Refactor as a backend test. def test_horovod_torch_mnist_stateful(ray_start_4_cpus): diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index ac9a0afe7cfb..a4ac411eb9f5 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -18,10 +18,10 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) -from ray.train.examples.train_linear_example import LinearDataset +from ray.train.examples.torch_linear_example import LinearDataset from ray.train.horovod.horovod_trainer import HorovodTrainer from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer @@ -350,7 +350,7 @@ def test_tune_tensorflow_mnist_gpu(ray_start_4_cpus_2_gpus): def test_train_linear_dataset_gpu(ray_start_4_cpus_2_gpus): - from ray.train.examples.train_linear_dataset_example import train_linear + from ray.train.examples.torch_linear_dataset_example import train_linear assert train_linear(num_workers=2, use_gpu=True) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index e25193eddf54..6c34bb7259b4 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -13,7 +13,7 @@ from ray.train.examples.tensorflow_mnist_example import ( train_func as tensorflow_mnist_train_func, ) -from ray.train.examples.train_fashion_mnist_example import ( +from ray.train.examples.torch_fashion_mnist_example import ( train_func as fashion_mnist_train_func, ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py index d354b2834ac6..0704bed7ec75 100644 --- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py +++ 
b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py @@ -6,7 +6,7 @@ import ray from ray import tune from ray.air.config import RunConfig -from ray.train.examples.tune_cifar_pytorch_pbt_example import train_func +from ray.train.examples.tune_cifar_torch_pbt_example import train_func from ray.train.torch import TorchConfig, TorchTrainer from ray.tune.schedulers import PopulationBasedTraining from ray.tune.tune_config import TuneConfig diff --git a/release/ml_user_tests/train/train_torch_linear_test.py b/release/ml_user_tests/train/train_torch_linear_test.py index 2a2a0a751061..1629ec6cdda9 100644 --- a/release/ml_user_tests/train/train_torch_linear_test.py +++ b/release/ml_user_tests/train/train_torch_linear_test.py @@ -4,7 +4,7 @@ import ray -from ray.train.examples.train_linear_example import train_linear +from ray.train.examples.torch_linear_example import train_linear if __name__ == "__main__": start = time.time() From 2bf89d221e4e171b1ebb3f9f62a8f8d3532cd3da Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:43:12 +0000 Subject: [PATCH 56/63] Use keras callback --- .../examples/tensorflow_linear_dataset_example.py | 11 +++-------- python/ray/train/examples/tensorflow_mnist_example.py | 8 +------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index ccc408455b44..0ee9d48d2077 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -2,20 +2,15 @@ from typing import Dict, Tuple import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback import ray -import ray.train as train +from ray.air import session from ray.air.config import DatasetConfig from ray.data import Dataset from ray.train.tensorflow import TensorflowTrainer, prepare_dataset_shard -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def get_datasets_and_configs( a=5, b=10, size=1000 ) -> Tuple[Dict[str, Dataset], Dict[str, DatasetConfig]]: @@ -60,7 +55,7 @@ def train_func(config): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_and_compile_model(config) - dataset_pipeline = train.get_dataset_shard("train") + dataset_pipeline = session.get_dataset_shard("train") dataset_iterator = dataset_pipeline.iter_epochs() for _ in range(epochs): diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index a0ef319f8756..97e8db033025 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -7,17 +7,11 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import Callback +from ray.air.callbacks.keras import Callback as TrainReportCallback -import ray.train as train from ray.train.tensorflow import TensorflowTrainer -class TrainReportCallback(Callback): - def on_epoch_end(self, epoch, logs=None): - train.report(**logs) - - def mnist_dataset(batch_size): (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() # The `x` arrays are in uint8 and have values in the [0, 255] range. 
From 375790ecde6e07658618bedf74d6b991e801c2c7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 16:44:47 +0000 Subject: [PATCH 57/63] Revert docstring changes --- python/ray/train/train_loop_utils.py | 73 ++++++++++++---------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/python/ray/train/train_loop_utils.py b/python/ray/train/train_loop_utils.py index 5b03fd1fffe5..58244652e961 100644 --- a/python/ray/train/train_loop_utils.py +++ b/python/ray/train/train_loop_utils.py @@ -38,25 +38,23 @@ def get_dataset_shard( import ray from ray import train - from ray.train.torch import TorchTrainer def train_func(): model = Net() for iter in range(100): - data_shard = train.get_dataset_shard("train").to_torch() + data_shard = train.get_dataset_shard().to_torch() model.train(data_shard) return model dataset = ray.data.read_csv("train.csv") dataset.filter(...).repeat().random_shuffle() + trainer = Trainer(backend="torch") + trainer.start() + # Trainer will automatically handle sharding. - trainer = TorchTrainer( - train_func, - datasets={"train": dataset}, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + train_model = trainer.run(train_func, dataset=dataset) + trainer.shutdown() Args: dataset_name: If a Dictionary of Datasets was passed to ``Trainer``, then @@ -97,15 +95,16 @@ def report(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.report(hello="world") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be reported by Train. @@ -127,7 +126,6 @@ def world_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): @@ -135,8 +133,10 @@ def train_func(): if train.world_rank() == 0: print("Worker 0") - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -153,18 +153,16 @@ def local_rank() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): if torch.cuda.is_available(): torch.cuda.set_device(train.local_rank()) ... - trainer = TorchTrainer( - train_func, - scaling_config={"use_gpu": True, "num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch", use_gpu=True) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() @@ -180,29 +178,18 @@ def load_checkpoint() -> Optional[Dict]: .. code-block:: python from ray import train - from ray.air import Checkpoint - from ray.train.torch import TorchTrainer def train_func(): checkpoint = train.load_checkpoint() for iter in range(checkpoint["epoch"], 5): print(iter) - checkpoint = Checkpoint.from_dict( - { - # this would be set during checkpoint saving - "_current_checkpoint_id": 1, - "epoch": 3, - } - ) - trainer = TorchTrainer( - train_func, - resume_from_checkpoint=checkpoint, - scaling_config={"num_workers": 2}, - ) - trainer.fit() + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func, checkpoint={"epoch": 3}) # 3 # 4 + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. 
@@ -226,16 +213,16 @@ def save_checkpoint(**kwargs) -> None: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): for iter in range(100): time.sleep(1) train.save_checkpoint(epoch=iter) - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 2}) - result = trainer.fit() - assert result.checkpoint + trainer = Trainer(backend="torch") + trainer.start() + trainer.run(train_func) + trainer.shutdown() Args: **kwargs: Any key value pair to be checkpointed by Train. @@ -255,14 +242,14 @@ def world_size() -> int: import time from ray import train - from ray.train.torch import TorchTrainer def train_func(): assert train.world_size() == 4 - trainer = TorchTrainer(train_func, scaling_config={"num_workers": 4}) - result = trainer.fit() - + trainer = Trainer(backend="torch", num_workers=4) + trainer.start() + trainer.run(train_func) + trainer.shutdown() """ session = get_session() if session is None: From baaaf47718c6b5a46d228a9829c764e8b5f9390e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:40:37 +0000 Subject: [PATCH 58/63] Rename example files in docs --- ..._fashion_mnist_example.rst => torch_fashion_mnist_example.rst} | 0 ...inear_dataset_example.rst => torch_linear_dataset_example.rst} | 0 .../{train_linear_example.rst => torch_linear_example.rst} | 0 ...r_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename doc/source/train/examples/{train_fashion_mnist_example.rst => torch_fashion_mnist_example.rst} (100%) rename doc/source/train/examples/{train_linear_dataset_example.rst => torch_linear_dataset_example.rst} (100%) rename doc/source/train/examples/{train_linear_example.rst => torch_linear_example.rst} (100%) rename doc/source/train/examples/{tune_cifar_pytorch_pbt_example.rst => tune_cifar_torch_pbt_example.rst} (100%) diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/torch_fashion_mnist_example.rst similarity index 100% rename from doc/source/train/examples/train_fashion_mnist_example.rst rename to doc/source/train/examples/torch_fashion_mnist_example.rst diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/torch_linear_dataset_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_dataset_example.rst rename to doc/source/train/examples/torch_linear_dataset_example.rst diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/torch_linear_example.rst similarity index 100% rename from doc/source/train/examples/train_linear_example.rst rename to doc/source/train/examples/torch_linear_example.rst diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_torch_pbt_example.rst similarity index 100% rename from doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst rename to doc/source/train/examples/tune_cifar_torch_pbt_example.rst From 691ce99d80343295a18be0947422cab225b53a19 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 30 Jun 2022 17:49:34 +0000 Subject: [PATCH 59/63] Add legacy tests --- python/ray/train/tests/test_minimal.py | 49 ++++++++++++++++ python/ray/train/tests/test_tune.py | 78 ++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 7541edb16852..e3a1670ed3fb 100644 --- a/python/ray/train/tests/test_minimal.py +++ 
b/python/ray/train/tests/test_minimal.py @@ -1,6 +1,11 @@ +from typing import List, Dict + import pytest import ray +import ray.train as train +from ray.train import Trainer +from ray.train.callbacks import TrainingCallback from ray.air import session from ray.air.checkpoint import Checkpoint from ray.train._internal.worker_group import WorkerGroup @@ -30,6 +35,14 @@ def on_shutdown(self, worker_group: WorkerGroup, backend_config: TestConfig): pass +class TestCallback(TrainingCallback): + def __init__(self): + self.result_list = [] + + def handle_result(self, results: List[Dict], **info): + self.result_list.append(results) + + def test_run(ray_start_4_cpus): """Tests that Train can be run without any specific backends.""" num_workers = 2 @@ -61,6 +74,42 @@ def train_func(): assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key] +def test_run_legacy(ray_start_4_cpus): + """Tests that Train can be run without any specific backends.""" + num_workers = 2 + key = "value" + value = 1 + config = TestConfig() + + def train_func(): + checkpoint = train.load_checkpoint() + train.report(**checkpoint) + train.save_checkpoint(**checkpoint) + return checkpoint[key] + + checkpoint = {key: value} + test_callback = TestCallback() + + trainer = Trainer(config, num_workers=num_workers) + trainer.start() + results = trainer.run(train_func, checkpoint=checkpoint, callbacks=[test_callback]) + + # Test results. + assert len(results) == num_workers + assert all(result == 1 for result in results) + + # Test reporting and callbacks. + assert len(test_callback.result_list) == value + assert len(test_callback.result_list[0]) == num_workers + print(test_callback.result_list[0]) + assert all(result[key] == value for result in test_callback.result_list[0]) + + # Test checkpointing. 
+ assert trainer.latest_checkpoint[key] == value + + trainer.shutdown() + + def test_failure(): """Tests that backend frameworks and non-critical libraries are not imported.""" with pytest.raises(ModuleNotFoundError): diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 6c34bb7259b4..640fa98a19a0 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -5,6 +5,7 @@ import ray import ray.train as train from ray import tune +from ray.tune import TuneError from ray.air import Checkpoint, session from ray.air.config import FailureConfig, RunConfig from ray.train._internal.worker_group import WorkerGroup @@ -18,6 +19,7 @@ ) from ray.train.tensorflow.tensorflow_trainer import TensorflowTrainer from ray.train.torch.torch_trainer import TorchTrainer +from ray.train.trainer import Trainer from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner @@ -219,6 +221,82 @@ def train_func(): assert len(trial_dfs[0]["training_iteration"]) == 4 +def test_tune_error_legacy(ray_start_4_cpus): + def train_func(config): + raise RuntimeError("Error in training function!") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + with pytest.raises(TuneError): + tune.run(TestTrainable) + + +def test_tune_checkpoint_legacy(ray_start_4_cpus): + def train_func(): + for i in range(10): + train.report(test=i) + train.save_checkpoint(hello="world") + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable).trials + checkpoint_path = trial.checkpoint.dir_or_data + assert os.path.exists(checkpoint_path) + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["hello"] == "world" + + +def test_reuse_checkpoint_legacy(ray_start_4_cpus): + def train_func(config): + itr = 0 + ckpt = train.load_checkpoint() + if ckpt is not None: + itr = ckpt["iter"] + 1 + + for i in range(itr, config["max_iter"]): + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials + checkpoint_path = trial.checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 4 + analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path) + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 5 + + +def test_retry_legacy(ray_start_4_cpus): + def train_func(): + ckpt = train.load_checkpoint() + restored = bool(ckpt) # Does a previous checkpoint exist? 
+ itr = 0 + if ckpt: + itr = ckpt["iter"] + 1 + + for i in range(itr, 4): + if i == 2 and not restored: + raise Exception("try to fail me") + train.save_checkpoint(iter=i) + train.report(test=i, training_iteration=i) + + trainer = Trainer(TestConfig(), num_workers=1) + TestTrainable = trainer.to_tune_trainable(train_func) + + analysis = tune.run(TestTrainable, max_failures=3) + checkpoint_path = analysis.trials[0].checkpoint.dir_or_data + checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict() + assert checkpoint["iter"] == 3 + + trial_dfs = list(analysis.trial_dataframes.values()) + assert len(trial_dfs[0]["training_iteration"]) == 4 + + if __name__ == "__main__": import sys From 587ad5634779021acc6ac63a6df49ae67bbf47d3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:24:59 +0000 Subject: [PATCH 60/63] Add todo --- python/ray/train/examples/torch_fashion_mnist_example.py | 2 ++ python/ray/train/examples/torch_linear_example.py | 1 + python/ray/train/examples/tune_cifar_torch_pbt_example.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 6e8db3220db4..5d716cb2dd91 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -114,6 +114,8 @@ def train_func(config: Dict): train.report(loss=loss) loss_results.append(loss) + # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return loss_results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index ceabd0c2853f..892cbb486244 100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -81,6 +81,7 @@ def train_func(config): train.report(**result) results.append(result) # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results diff --git a/python/ray/train/examples/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/tune_cifar_torch_pbt_example.py index f0b5c786ff8d..bddc01e1cd95 100644 --- a/python/ray/train/examples/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/tune_cifar_torch_pbt_example.py @@ -117,6 +117,8 @@ def train_func(config): train.report(**result) results.append(result) + # return required for backwards compatibility with the old API + # TODO(team-ml) clean up and remove return return results From 139f44d495083261ac0cb3ff33c80f18d59a5ff0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 18:33:30 +0000 Subject: [PATCH 61/63] Use `trial_logdir` instead --- python/ray/air/result.py | 3 ++- python/ray/train/examples/mlflow_simple_example.py | 2 +- python/ray/tune/result_grid.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 833aa7660f33..a6cf8fe47353 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from ray.air.checkpoint import Checkpoint @@ -38,7 +39,7 @@ class Result: metrics: Optional[Dict[str, Any]] checkpoint: Optional[Checkpoint] error: Optional[Exception] - log_dir: Optional[str] + log_dir: Optional[Path] metrics_dataframe: Optional[pd.DataFrame] best_checkpoints: 
Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py index d64d0525ae58..b3a7264b20d8 100644 --- a/python/ray/train/examples/mlflow_simple_example.py +++ b/python/ray/train/examples/mlflow_simple_example.py @@ -41,7 +41,7 @@ def train_func(): # Print the latest run directory and keep note of it. # For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 -print("Run directory:", result.log_dir) +print("Run directory:", result.log_dir.parent) # TensorBoard is saved in parent dir # How to visualize the logs diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 77994e2f491b..bdf39b97f4ef 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from typing import Optional, Union import pandas as pd @@ -180,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=trial.local_dir, + log_dir=Path(trial.logdir), metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 3a4d3f347d3f1170269ce77ad4adbd7ab057e32d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 6 Jul 2022 21:49:00 +0000 Subject: [PATCH 62/63] Fix --- python/ray/tune/result_grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index bdf39b97f4ef..bec6a1438bee 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -181,7 +181,7 @@ def _trial_to_result(self, trial: Trial) -> Result: checkpoint=checkpoint, metrics=trial.last_result.copy(), error=self._populate_exception(trial), - log_dir=Path(trial.logdir), + log_dir=Path(trial.logdir) if trial.logdir else None, metrics_dataframe=self._experiment_analysis.trial_dataframes.get( trial.logdir ) From 2ea93d78a4bffc4d2b3e4e000dafdae3dc18390f Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 7 Jul 2022 17:51:47 +0000 Subject: [PATCH 63/63] Only print metrics --- python/ray/train/examples/horovod/horovod_example.py | 2 +- python/ray/train/examples/mlflow_fashion_mnist_example.py | 2 +- python/ray/train/examples/tensorflow_linear_dataset_example.py | 2 +- python/ray/train/examples/tensorflow_mnist_example.py | 2 +- .../auto_pipeline_for_host_to_device_data_transfer.py | 2 +- python/ray/train/examples/torch_fashion_mnist_example.py | 2 +- python/ray/train/examples/torch_linear_dataset_example.py | 2 +- python/ray/train/examples/torch_linear_example.py | 2 +- python/ray/train/examples/transformers/transformers_example.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py index c01788008ec5..8e930f7d151f 100644 --- a/python/ray/train/examples/horovod/horovod_example.py +++ b/python/ray/train/examples/horovod/horovod_example.py @@ -158,7 +158,7 @@ def main(num_workers, use_gpu, kwargs): scaling_config={"use_gpu": use_gpu, "num_workers": num_workers}, ) results = trainer.fit() - print(results) + print(results.metrics) # Horovod Class API. 
diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py index 2d223c43ec1d..99f7b73a525a 100644 --- a/python/ray/train/examples/mlflow_fashion_mnist_example.py +++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py @@ -17,7 +17,7 @@ def main(num_workers=2, use_gpu=False): ) final_results = trainer.fit() - print("Full results for rank 0 worker: ", final_results) + print("Final metrics: ", final_results.metrics) if __name__ == "__main__": diff --git a/python/ray/train/examples/tensorflow_linear_dataset_example.py b/python/ray/train/examples/tensorflow_linear_dataset_example.py index 0ee9d48d2077..f3a938e06c0e 100644 --- a/python/ray/train/examples/tensorflow_linear_dataset_example.py +++ b/python/ray/train/examples/tensorflow_linear_dataset_example.py @@ -83,7 +83,7 @@ def train_tensorflow_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results}") + print(f"Results: {results.metrics}") return results diff --git a/python/ray/train/examples/tensorflow_mnist_example.py b/python/ray/train/examples/tensorflow_mnist_example.py index 97e8db033025..14f4cf6dc7ef 100644 --- a/python/ray/train/examples/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tensorflow_mnist_example.py @@ -81,7 +81,7 @@ def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(f"Results: {results[0]}") + print(f"Results: {results.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py index 03e69ca67f96..1220f541d034 100644 --- a/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py +++ b/python/ray/train/examples/torch_data_prefetch_benchmark/auto_pipeline_for_host_to_device_data_transfer.py @@ -109,7 +109,7 @@ def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epo ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_fashion_mnist_example.py b/python/ray/train/examples/torch_fashion_mnist_example.py index 5d716cb2dd91..7ad5017bbc5c 100644 --- a/python/ray/train/examples/torch_fashion_mnist_example.py +++ b/python/ray/train/examples/torch_fashion_mnist_example.py @@ -126,7 +126,7 @@ def train_fashion_mnist(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) result = trainer.fit() - print(f"Results: {result}") + print(f"Results: {result.metrics}") if __name__ == "__main__": diff --git a/python/ray/train/examples/torch_linear_dataset_example.py b/python/ray/train/examples/torch_linear_dataset_example.py index acfa0ce2e637..15fbf0da97b9 100644 --- a/python/ray/train/examples/torch_linear_dataset_example.py +++ b/python/ray/train/examples/torch_linear_dataset_example.py @@ -128,7 +128,7 @@ def train_linear(num_workers=2, use_gpu=False): scaling_config={"num_workers": num_workers, "use_gpu": use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/torch_linear_example.py b/python/ray/train/examples/torch_linear_example.py index 892cbb486244..8be2e1d2dcc6 
100644 --- a/python/ray/train/examples/torch_linear_example.py +++ b/python/ray/train/examples/torch_linear_example.py @@ -94,7 +94,7 @@ def train_linear(num_workers=2, use_gpu=False, epochs=3): ) results = trainer.fit() - print(results) + print(results.metrics) return results diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index 1b47e5ce2e31..30f9f0158f06 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -619,7 +619,7 @@ def main(): scaling_config={"num_workers": args.num_workers, "use_gpu": args.use_gpu}, ) results = trainer.fit() - print(results) + print(results.metrics) else: # Run training locally. train_func(config)
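
Taken together, the commits above move the Train examples and tests off the legacy `Trainer.run` flow and onto the AIR `Trainer.fit()` / `session.report` flow, and switch result access from `Result.dataframe` to `Result.metrics_dataframe`. Below is a minimal sketch of that target pattern, for orientation only. It is not part of the patch series: it uses only calls that appear in the patches (`TorchTrainer`, `session.report`, `Checkpoint.from_dict`, and the `Result` fields `metrics`, `metrics_dataframe`, `log_dir`), while the toy model, data, and hyperparameters are illustrative stand-ins rather than code from any file above.

# Minimal sketch of the migrated API surface these patches target.
# The linear model, random data, and config values are stand-ins.
import torch
import torch.nn as nn

from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    model = nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    for epoch in range(config["epochs"]):
        x = torch.randn(32, 1)
        loss = nn.functional.mse_loss(model(x), 3 * x + 1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # New-style reporting: a metrics dict plus an optional Checkpoint,
        # replacing train.report(**kwargs) / train.save_checkpoint(**kwargs).
        session.report(
            {"loss": loss.item(), "epoch": epoch},
            checkpoint=Checkpoint.from_dict({"epoch": epoch}),
        )


trainer = TorchTrainer(
    train_loop_per_worker,
    train_loop_config={"lr": 1e-2, "epochs": 3},
    scaling_config={"num_workers": 2, "use_gpu": False},
)
result = trainer.fit()

# Result fields touched by these patches: metrics, metrics_dataframe, log_dir.
print(result.metrics)
print(list(result.metrics_dataframe["loss"]))
print(result.log_dir)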