From 39e63cfbb896b1ec648879388833c54184770f16 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 8 Aug 2022 11:56:11 -0700 Subject: [PATCH] [cherry-pick][releases/2.0.0][air/docs] Update Trainer documentation (#27481) (#27644) --- doc/source/_toc.yml | 13 +- .../ray-air/examples/xgboost_starter.py | 1 + doc/source/ray-air/getting-started.rst | 2 +- doc/source/ray-air/package-ref.rst | 10 + doc/source/ray-air/trainer.rst | 10 +- doc/source/ray-air/tuner.rst | 4 +- doc/source/train/api.rst | 115 --------- doc/source/train/config_guide.rst | 87 +++++++ .../train/{user_guide.rst => dl_guide.rst} | 107 ++------- doc/source/train/doc_code/gbdt_user_guide.py | 91 +++++++- doc/source/train/doc_code/key_concepts.py | 155 +++++++++++++ .../train/doc_code/xgboost_train_predict.py | 33 +++ .../benchmark_example.rst | 2 +- doc/source/train/faq.rst | 2 +- doc/source/train/gbdt.rst | 14 +- doc/source/train/getting-started.rst | 181 +++++++++++++++ doc/source/train/images/train-specific.svg | 1 + doc/source/train/key-concepts.rst | 121 ++++++++++ doc/source/train/train.rst | 219 +++++++----------- doc/source/train/user-guides.rst | 45 ++++ doc/source/tune/api_docs/overview.rst | 1 + doc/source/tune/api_docs/syncing.rst | 20 ++ doc/source/tune/api_docs/trainable.rst | 4 + python/ray/air/config.py | 6 +- python/ray/train/_internal/checkpoint.py | 4 +- python/ray/train/data_parallel_trainer.py | 3 +- .../ray/train/examples/torch_quick_start.py | 6 +- python/ray/train/horovod/horovod_trainer.py | 3 +- .../train/tensorflow/tensorflow_trainer.py | 7 +- python/ray/train/torch/torch_trainer.py | 20 +- python/ray/tune/syncer.py | 4 +- 31 files changed, 908 insertions(+), 383 deletions(-) create mode 100644 doc/source/train/config_guide.rst rename doc/source/train/{user_guide.rst => dl_guide.rst} (90%) create mode 100644 doc/source/train/doc_code/key_concepts.py create mode 100644 doc/source/train/doc_code/xgboost_train_predict.py create mode 100644 doc/source/train/getting-started.rst create mode 100644 doc/source/train/images/train-specific.svg create mode 100644 doc/source/train/key-concepts.rst create mode 100644 doc/source/train/user-guides.rst create mode 100644 doc/source/tune/api_docs/syncing.rst diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 9aa72fd21e55..0fdae2fbaa68 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -66,11 +66,16 @@ parts: - file: train/train title: Ray Train sections: - - file: train/user_guide - - file: train/gbdt - - file: train/examples + - file: train/getting-started + - file: train/key-concepts + - file: train/user-guides + sections: + - file: train/config_guide + - file: train/dl_guide + - file: train/gbdt + - file: train/architecture - file: train/faq - - file: train/architecture + - file: train/examples - file: train/api - file: tune/index diff --git a/doc/source/ray-air/examples/xgboost_starter.py b/doc/source/ray-air/examples/xgboost_starter.py index 40bd0bfff008..d8c4adae798c 100644 --- a/doc/source/ray-air/examples/xgboost_starter.py +++ b/doc/source/ray-air/examples/xgboost_starter.py @@ -39,6 +39,7 @@ params={ # XGBoost specific params "objective": "binary:logistic", + # "tree_method": "gpu_hist", # uncomment this to use GPUs. 
"eval_metric": ["logloss", "error"], }, datasets={"train": train_dataset, "valid": valid_dataset}, diff --git a/doc/source/ray-air/getting-started.rst b/doc/source/ray-air/getting-started.rst index b84d38448fd0..7b67ed582ba0 100644 --- a/doc/source/ray-air/getting-started.rst +++ b/doc/source/ray-air/getting-started.rst @@ -22,6 +22,7 @@ Get started by installing Ray AIR: .. code:: bash pip install -U "ray[air]" + # The below Ray AIR tutorial was written with the following libraries. # Consider running the following to ensure that the code below runs properly: pip install -U pandas>=1.3.5 @@ -30,7 +31,6 @@ Get started by installing Ray AIR: pip install -U tensorflow>=2.6.2 pip install -U pyarrow>=6.0.1 - Quick Start ----------- diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 266e2d3ff05a..cfad36f71ac1 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -146,6 +146,8 @@ TensorFlow :members: :show-inheritance: +.. _air-pytorch-ref: + PyTorch ####### @@ -174,6 +176,14 @@ Scikit-Learn :members: :show-inheritance: + +Reinforcement Learning (RLlib) +############################## + +.. automodule:: ray.train.rl + :members: + :show-inheritance: + .. _air-builtin-callbacks: Monitoring Integrations diff --git a/doc/source/ray-air/trainer.rst b/doc/source/ray-air/trainer.rst index df14540353cf..a9c28c1ca6cf 100644 --- a/doc/source/ray-air/trainer.rst +++ b/doc/source/ray-air/trainer.rst @@ -1,7 +1,7 @@ .. _air-trainers: -Ray AIR Trainers -================ +Using Trainers for Distributed Training +======================================= .. https://docs.google.com/drawings/d/1anmT0JVFH9abR5wX5_WcxNHJh6jWeDL49zWxGpkfORA/edit @@ -32,7 +32,7 @@ construct a Trainer, you can provide: * A collection of :ref:`datasets ` and a :ref:`preprocessor ` for the provided datasets, which configures preprocessing and the datasets to ingest from. * ``resume_from_checkpoint``, which is a checkpoint path to resume from, should your training run be interrupted. -After instatiating a Trainer, you can invoke it by calling :meth:`Trainer.fit() `. +After instantiating a Trainer, you can invoke it by calling :meth:`Trainer.fit() `. .. literalinclude:: doc_code/xgboost_trainer.py :language: python @@ -63,7 +63,7 @@ You can access the data shard within a worker via ``session.get_dataset_shard()` to generate batches of Tensorflow or Pytorch tensors. You can read more about :ref:`data ingest ` here. -Read more about :ref:`Ray Train's Deep Learning Trainers `. +Read more about :ref:`Ray Train's Deep Learning Trainers `. .. dropdown:: Code examples @@ -110,7 +110,7 @@ Ray Train offers 2 main tree-based trainers: :class:`XGBoostTrainer ` and :class:`LightGBMTrainer `. -See :ref:`here for a more detailed user-guide `. +See :ref:`here for a more detailed user-guide `. XGBoost Trainer diff --git a/doc/source/ray-air/tuner.rst b/doc/source/ray-air/tuner.rst index 2823dd1910df..2acd4bced3bf 100644 --- a/doc/source/ray-air/tuner.rst +++ b/doc/source/ray-air/tuner.rst @@ -45,6 +45,8 @@ Below, we demonstrate how you can use a Trainer object with a Tuner. :end-before: __basic_end__ +.. _air-tuner-search-space: + How to configure a search space? -------------------------------- @@ -54,7 +56,7 @@ from which hyperparameter configurations will be sampled. 
Depending on the model and dataset, you may want to tune: - The training batch size -- The learning rate for SGD-based training (e.g., image classification) +- The learning rate for deep learning training (e.g., image classification) - The maximum depth for tree-based models (e.g., XGBoost) The following shows some example code on how to specify the ``param_space``. diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index afd363c4b212..68ae7cf74e62 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -52,121 +52,6 @@ BackendConfig .. autoclass:: ray.train.backend.BackendConfig -.. _train-api-func-utils: - -Training Function Utilities ---------------------------- - -train.report -~~~~~~~~~~~~ - -.. autofunction:: ray.train.report - -train.load_checkpoint -~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.load_checkpoint - -train.save_checkpoint -~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.save_checkpoint - -train.get_dataset_shard -~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.get_dataset_shard - -train.world_rank -~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.world_rank - -train.local_rank -~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.local_rank - -train.world_size -~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.world_size - -.. _train-api-torch-utils: - -PyTorch Training Function Utilities ------------------------------------ - -.. _train-api-torch-prepare-model: - -train.torch.prepare_model -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.prepare_model - :noindex: - -.. _train-api-torch-prepare-data-loader: - -train.torch.prepare_data_loader -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.prepare_data_loader - :noindex: - -train.torch.prepare_optimizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.prepare_optimizer - :noindex: - - -train.torch.backward -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.backward - :noindex: - -.. _train-api-torch-get-device: - -train.torch.get_device -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.get_device - :noindex: - -train.torch.enable_reproducibility -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.enable_reproducibility - :noindex: - -.. _train-api-torch-worker-profiler: - -train.torch.accelerate -~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.torch.accelerate - :noindex: - -train.torch.TorchWorkerProfiler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: ray.train.torch.TorchWorkerProfiler - :members: - :noindex: - -.. _train-api-tensorflow-utils: - -TensorFlow Training Function Utilities --------------------------------------- - -train.tensorflow.prepare_dataset_shard -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ray.train.tensorflow.prepare_dataset_shard - :noindex: - - Deprecated APIs --------------- diff --git a/doc/source/train/config_guide.rst b/doc/source/train/config_guide.rst new file mode 100644 index 000000000000..d152e958beb6 --- /dev/null +++ b/doc/source/train/config_guide.rst @@ -0,0 +1,87 @@ +.. _train-config: + +Configurations User Guide +========================= + +The following overviews how to configure scale-out, run options, and fault-tolerance for Train. +For more details on how to configure data ingest, also refer to :ref:`air-ingest`. 
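+
+All of these configurations are passed to a Trainer when it is constructed. As a minimal
+sketch (assuming a Ray Dataset named ``train_dataset`` has already been created), a Trainer
+configured with a scaling and a run configuration looks roughly like this:
+
+.. code-block:: python
+
+    from ray.air.config import RunConfig, ScalingConfig
+    from ray.train.xgboost import XGBoostTrainer
+
+    trainer = XGBoostTrainer(
+        label_column="target",
+        params={"objective": "binary:logistic"},
+        datasets={"train": train_dataset},
+        # Scale training out to two data-parallel workers.
+        scaling_config=ScalingConfig(num_workers=2),
+        # Name the run and lower the logging verbosity.
+        run_config=RunConfig(name="my_train_run", verbose=1),
+    )
+    result = trainer.fit()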
+
+Scaling configuration (``ScalingConfig``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The scaling configuration specifies distributed training properties like the number of workers or the
+resources per worker.
+
+The properties of the scaling configuration are :ref:`tunable `.
+
+:class:`ScalingConfig API reference `
+
+.. literalinclude:: doc_code/key_concepts.py
+    :language: python
+    :start-after: __scaling_config_start__
+    :end-before: __scaling_config_end__
+
+
+Run configuration (``RunConfig``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The run configuration specifies runtime properties of the training run, such as the name of the run,
+the directory in which results are stored, and the logging verbosity. It also contains the failure,
+sync, and checkpoint configurations described below.
+
+The properties of the run configuration are :ref:`not tunable `.
+
+:class:`RunConfig API reference `
+
+.. literalinclude:: doc_code/key_concepts.py
+    :language: python
+    :start-after: __run_config_start__
+    :end-before: __run_config_end__
+
+Failure configuration (``FailureConfig``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The failure configuration specifies how training failures should be dealt with.
+
+As part of the RunConfig, the properties of the failure configuration
+are :ref:`not tunable `.
+
+:class:`FailureConfig API reference `
+
+.. literalinclude:: doc_code/key_concepts.py
+    :language: python
+    :start-after: __failure_config_start__
+    :end-before: __failure_config_end__
+
+Sync configuration (``SyncConfig``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The sync configuration specifies how to synchronize checkpoints between the
+Ray cluster and remote storage.
+
+As part of the RunConfig, the properties of the sync configuration
+are :ref:`not tunable `.
+
+:class:`SyncConfig API reference `
+
+.. literalinclude:: doc_code/key_concepts.py
+    :language: python
+    :start-after: __sync_config_start__
+    :end-before: __sync_config_end__
+
+
+Checkpoint configuration (``CheckpointConfig``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The checkpoint configuration specifies how often to checkpoint training state
+and how many checkpoints to keep.
+
+As part of the RunConfig, the properties of the checkpoint configuration
+are :ref:`not tunable `.
+
+:class:`CheckpointConfig API reference `
+
+.. literalinclude:: doc_code/key_concepts.py
+    :language: python
+    :start-after: __checkpoint_config_start__
+    :end-before: __checkpoint_config_end__
+
diff --git a/doc/source/train/user_guide.rst b/doc/source/train/dl_guide.rst
similarity index 90%
rename from doc/source/train/user_guide.rst
rename to doc/source/train/dl_guide.rst
index 0013255b555e..91a495e3e554 100644
--- a/doc/source/train/user_guide.rst
+++ b/doc/source/train/dl_guide.rst
@@ -1,24 +1,13 @@
-.. _train-user-guide:
+.. _train-dl-guide:
 
-Ray Train Deep Learning User Guide
-==================================
+Deep Learning User Guide
+========================
 
-.. tip:: Get in touch with us if you're using or considering using `Ray Train `_!
-
-Ray Train provides solutions for training machine learning models in a distributed manner on Ray.
-This guide focuses on deep learning with PyTorch, TensorFlow and Horovod.
-For other model types, distributed training support is available through other Trainers & libraries: - -* **Reinforcement Learning:** :ref:`RLlib ` -* **XGBoost:** :doc:`/ray-air/examples/xgboost_example` -* **LightGBM:** :doc:`/ray-air/examples/lightgbm_example` -* **Scikit-Learn** :doc:`/ray-air/examples/sklearn_example` -* **Hugging Face** :doc:`/ray-air/examples/huggingface_text_classification` -* **PyTorch Lightning:** :ref:`ray-lightning` +This guide explains how to use Train to scale PyTorch, TensorFlow and Horovod. In this guide, we cover examples for the following use cases: -* How do I :ref:`port my code ` to using Ray Train? +* How do I :ref:`port my code ` to use Ray Train? * How do I use Ray Train to :ref:`train with a large dataset `? * How do I :ref:`monitor ` my training? * How do I run my training on pre-emptible instances @@ -408,95 +397,35 @@ of the :class:`Result` object returned by ``Trainer.fit``. Distributed Data Ingest with Ray Datasets ----------------------------------------- -Ray Train provides native support for :ref:`Ray Datasets ` to support the following use cases: - -1. **Large Datasets**: With Ray Datasets, you can easily work with datasets that are too big to fit on a single node. - Ray Datasets will distribute the dataset across the Ray Cluster and allow you to perform dataset operations (map, filter, etc.) - on the distributed dataset. -2. **Automatic locality-aware sharding**: If provided a Ray Dataset, Ray Train will automatically shard the dataset and assign each shard - to a training worker while minimizing cross-node data transfer. Unlike with standard Torch or TensorFlow datasets, each training - worker will only load its assigned shard into memory rather than the entire ``Dataset``. -3. **Pipelined Execution**: Ray Datasets also supports pipelining, meaning that data processing operations - can be run concurrently with training. Training is no longer blocked on expensive data processing operations (such as global shuffling) - and this minimizes the amount of time your GPUs are idle. See :ref:`dataset-pipeline-api` for more information. - -To get started, pass in a Ray Dataset (or multiple) into ``Trainer``. Underneath the hood, Ray Train will automatically shard the given dataset. - -Using Ray Datasets is the recommended way for ingesting data into ``Trainer``\s and can be used with any ``Trainer`` in Ray AIR. - -.. warning:: - - If you are doing distributed training with TensorFlow, you will need to - disable TensorFlow's built-in autosharding as the data on each worker is - already sharded. - - .. code-block:: python - :emphasize-lines: 1, 6 - - from ray.train.tensorflow import prepare_dataset_shard - - def train_func(): - ... - tf_dataset = ray.train.get_dataset_shard().to_tf(...) - tf_dataset = prepare_dataset_shard(tf_dataset) +:ref:`Ray Datasets ` are the recommended way to work with large datasets in Ray Train. Datasets provides automatic loading, sharding, and pipelined ingest (optional) of Data across multiple Train workers. +To get started, pass in one or more datasets under the ``datasets`` keyword argument for Trainer (e.g., ``Trainer(datasets={...})``). - -**Simple Dataset Example** +Here's a simple code overview of the Datasets integration: .. code-block:: python - import ray - from ray import train - from ray.air import ScalingConfig - from ray.train.torch import TorchTrainer + from ray.air import session + # Datasets can be accessed in your train_func via ``get_dataset_shard``. def train_func(config): - # Create your model here. 
- model = NeuralNetwork() - - batch_size = config["worker_batch_size"] - - train_data_shard = train.get_dataset_shard("train") - train_torch_dataset = train_data_shard.to_torch( - label_column="label", batch_size=batch_size - ) - - validation_data_shard = train.get_dataset_shard("validation") - validation_torch_dataset = validation_data_shard.to_torch( - label_column="label", batch_size=batch_size - ) + train_data_shard = session.get_dataset_shard("train") + validation_data_shard = session.get_dataset_shard("validation") + ... - for epoch in config["num_epochs"]: - for X, y in train_torch_dataset: - model.train() - output = model(X) - # Train on one batch. - for X, y in validation_torch_dataset: - model.eval() - output = model(X) - # Validate one batch. - return model - - # Random split dataset into 80% training data and 20% validation data. + # Random split the dataset into 80% training data and 20% validation data. + dataset = ray.data.read_csv("...") train_dataset, validation_dataset = dataset.train_test_split( - test_size=0.2, shuffle=True + test_size=0.2, shuffle=True, ) trainer = TorchTrainer( train_func, - train_loop_config={"worker_batch_size": 64, "num_epochs": 2}, datasets={"train": train_dataset, "validation": validation_dataset}, scaling_config=ScalingConfig(num_workers=8), ) - dataset = ray.data.read_csv("...") - - result = trainer.fit() - -.. _train-dataset-pipeline: + trainer.fit() -Pipelined Execution -~~~~~~~~~~~~~~~~~~~ -For details on how to enable pipelined execution, please refer to :ref:`air-ingest`. +For more details on how to configure data ingest for Train, please refer to :ref:`air-ingest`. .. TODO link to Training Run Iterator API as a 3rd option for logging. diff --git a/doc/source/train/doc_code/gbdt_user_guide.py b/doc/source/train/doc_code/gbdt_user_guide.py index 3d0cea4a375c..0943a343c376 100644 --- a/doc/source/train/doc_code/gbdt_user_guide.py +++ b/doc/source/train/doc_code/gbdt_user_guide.py @@ -24,15 +24,59 @@ params={ # XGBoost specific params "objective": "binary:logistic", + # "tree_method": "gpu_hist", # uncomment this to use GPU for training "eval_metric": ["logloss", "error"], }, datasets={"train": train_dataset, "valid": valid_dataset}, ) result = trainer.fit() print(result.metrics) - # __xgboost_end__ +# __xgb_detail_intro_start__ +import ray + +# Load data. +dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv") + +# Split data into train and validation. +train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3) +# __xgb_detail_intro_end__ + +# __xgb_detail_scaling_start__ +from ray.air.config import ScalingConfig + +scaling_config = ScalingConfig( + # Number of workers to use for data parallelism. + num_workers=2, + # Whether to use GPU acceleration. 
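+    # Note: to actually train on GPUs, set this to True and also enable the
+    # GPU-specific "gpu_hist" tree method in the XGBoost params further below.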
+ use_gpu=False, +) +# __xgb_detail_scaling_end__ + +# __xgb_detail_training_start__ +from ray.train.xgboost import XGBoostTrainer + +trainer = XGBoostTrainer( + scaling_config=scaling_config, + label_column="target", + num_boost_round=20, + params={ + # XGBoost specific params + "objective": "binary:logistic", + # "tree_method": "gpu_hist", # uncomment this to use GPU for training + "eval_metric": ["logloss", "error"], + }, + datasets={"train": train_dataset, "valid": valid_dataset}, +) +# __xgb_detail_training_end__ + +# __xgb_detail_fit_start__ +result = trainer.fit() +print(result.metrics) +# __xgb_detail_fit_end__ + + # __lightgbm_start__ import ray from ray.train.lightgbm import LightGBMTrainer @@ -62,9 +106,52 @@ ) result = trainer.fit() print(result.metrics) - # __lightgbm_end__ + +# __lgbm_detail_intro_start__ +import ray + +# Load data. +dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv") + +# Split data into train and validation. +train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3) +# __lgbm_detail_intro_end__ + +# __lgbm_detail_scaling_start__ +from ray.air.config import ScalingConfig + +scaling_config = ScalingConfig( + # Number of workers to use for data parallelism. + num_workers=2, + # Whether to use GPU acceleration. + use_gpu=False, +) +# __lgbm_detail_scaling_end__ + +# __lgbm_detail_training_start__ +from ray.train.lightgbm import LightGBMTrainer + +trainer = LightGBMTrainer( + scaling_config=scaling_config, + label_column="target", + num_boost_round=20, + params={ + # LightGBM specific params + "objective": "binary", + "metric": ["binary_logloss", "binary_error"], + }, + datasets={"train": train_dataset, "valid": valid_dataset}, +) +# __lgbm_detail_training_end__ + +# __lgbm_detail_fit_start__ +result = trainer.fit() +print(result.metrics) +# __lgbm_detail_fit_end__ + + # __scaling_cpu_start__ scaling_config = ScalingConfig( num_workers=4, diff --git a/doc/source/train/doc_code/key_concepts.py b/doc/source/train/doc_code/key_concepts.py new file mode 100644 index 000000000000..7c6748ac7866 --- /dev/null +++ b/doc/source/train/doc_code/key_concepts.py @@ -0,0 +1,155 @@ +# flake8: noqa +# isort: skip_file + +# __session_report_start__ +from ray.air import session, ScalingConfig +from ray.train.data_parallel_trainer import DataParallelTrainer + + +def train_fn(config): + for i in range(10): + session.report({"step": i}) + + +trainer = DataParallelTrainer( + train_loop_per_worker=train_fn, scaling_config=ScalingConfig(num_workers=1) +) +trainer.fit() + +# __session_report_end__ + + +# __session_data_info_start__ +import ray.data +from ray.air import session, ScalingConfig +from ray.train.data_parallel_trainer import DataParallelTrainer + + +def train_fn(config): + dataset_shard = session.get_dataset_shard("train") + + session.report( + { + # Global world size + "world_size": session.get_world_size(), + # Global worker rank on the cluster + "world_rank": session.get_world_rank(), + # Local worker rank on the current machine + "local_rank": session.get_local_rank(), + # Data + "data_shard": dataset_shard.to_pandas().to_numpy().tolist(), + } + ) + + +trainer = DataParallelTrainer( + train_loop_per_worker=train_fn, + scaling_config=ScalingConfig(num_workers=2), + datasets={"train": ray.data.from_items([1, 2, 3, 4])}, +) +trainer.fit() +# __session_data_info_end__ + + +# __session_checkpoint_start__ +from ray.air import session, ScalingConfig, Checkpoint +from ray.train.data_parallel_trainer import DataParallelTrainer + + +def 
train_fn(config): + checkpoint = session.get_checkpoint() + + if checkpoint: + state = checkpoint.to_dict() + else: + state = {"step": 0} + + for i in range(state["step"], 10): + state["step"] += 1 + session.report( + metrics={"step": state["step"]}, checkpoint=Checkpoint.from_dict(state) + ) + + +trainer = DataParallelTrainer( + train_loop_per_worker=train_fn, + scaling_config=ScalingConfig(num_workers=1), + resume_from_checkpoint=Checkpoint.from_dict({"step": 4}), +) +trainer.fit() + +# __session_checkpoint_end__ + + +# __scaling_config_start__ +from ray.air import ScalingConfig + +scaling_config = ScalingConfig( + # Number of distributed workers. + num_workers=2, + # Turn on/off GPU. + use_gpu=True, + # Specify resources used for trainer. + trainer_resources={"CPU": 1}, + # Try to schedule workers on different nodes. + placement_strategy="SPREAD", +) +# __scaling_config_end__ + +# __run_config_start__ +from ray.air import RunConfig + +run_config = RunConfig( + # Name of the training run (directory name). + name="my_train_run", + # Directory to store results in (will be local_dir/name). + local_dir="~/ray_results", + # Low training verbosity. + verbose=1, +) +# __run_config_end__ + +# __failure_config_start__ +from ray.air import RunConfig, FailureConfig + +run_config = RunConfig( + failure_config=FailureConfig( + # Tries to recover a run up to this many times. + max_failures=2 + ) +) +# __failure_config_end__ + +# __sync_config_start__ +from ray.air import RunConfig +from ray.tune import SyncConfig + +run_config = RunConfig( + sync_config=SyncConfig( + # This will store checkpoints on S3. + upload_dir="s3://remote-bucket/location" + ) +) +# __sync_config_end__ + +# __checkpoint_config_start__ +from ray.air import RunConfig, CheckpointConfig + +run_config = RunConfig( + checkpoint_config=CheckpointConfig( + # Only keep this many checkpoints. 
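+        # Checkpoints beyond this number are deleted to reclaim space.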
+ num_to_keep=2 + ) +) +# __checkpoint_config_end__ + + +# __results_start__ +result = trainer.fit() + +# Print metrics +print("Observed metrics:", result.metrics) + +checkpoint_data = result.checkpoint.to_dict() +print("Checkpoint data:", checkpoint_data["step"]) +# __results_end__ diff --git a/doc/source/train/doc_code/xgboost_train_predict.py b/doc/source/train/doc_code/xgboost_train_predict.py new file mode 100644 index 000000000000..a23f0a03f623 --- /dev/null +++ b/doc/source/train/doc_code/xgboost_train_predict.py @@ -0,0 +1,33 @@ +# flake8: noqa +# isort: skip_file + +# __train_predict_start__ +import numpy as np +import ray + +from ray.train.xgboost import XGBoostTrainer, XGBoostPredictor +from ray.air.config import ScalingConfig + +train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) +trainer = XGBoostTrainer( + label_column="y", + params={"objective": "reg:squarederror"}, + scaling_config=ScalingConfig(num_workers=3), + datasets={"train": train_dataset}, +) +result = trainer.fit() + +predictor = XGBoostPredictor.from_checkpoint(result.checkpoint) +predictions = predictor.predict(np.expand_dims(np.arange(32, 64), 1)) +# __train_predict_end__ + +# __batch_predict_start__ +from ray.train.batch_predictor import BatchPredictor + +batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor) +predictions = batch_predictor.predict( + data=ray.data.from_items([{"x": x} for x in range(32)]), + batch_size=8, + min_scoring_workers=2, +) +# __batch_predict_end__ diff --git a/doc/source/train/examples/torch_data_prefetch_benchmark/benchmark_example.rst b/doc/source/train/examples/torch_data_prefetch_benchmark/benchmark_example.rst index acb6df2aae87..38fbc1ec2da2 100644 --- a/doc/source/train/examples/torch_data_prefetch_benchmark/benchmark_example.rst +++ b/doc/source/train/examples/torch_data_prefetch_benchmark/benchmark_example.rst @@ -4,7 +4,7 @@ Torch Data Prefetching Benchmark ================================ We provide a benchmark example to show how the auto pipeline for host to device data transfer speeds up training on GPUs. -This functionality can be easily enabled by setting ``auto_transfer=True`` in :ref:`train.torch.prepare_data_loader() `. +This functionality can be easily enabled by setting ``auto_transfer=True`` in :func:`train.torch.prepare_data_loader`. .. code-block:: python diff --git a/doc/source/train/faq.rst b/doc/source/train/faq.rst index 8e99829ed645..9e5b135f3a6b 100644 --- a/doc/source/train/faq.rst +++ b/doc/source/train/faq.rst @@ -23,6 +23,6 @@ To override these resource requests or request additional custom resources, you can initialize the ``Trainer`` with ``resources_per_worker`` specified in ``scaling_config``. .. note:: - Some GPU utility functions (e.g. :ref:`train-api-torch-get-device`, :ref:`train-api-torch-prepare-model`) + Some GPU utility functions (e.g. :func:`ray.train.torch.get_device`, :func:`ray.train.torch.prepare_model`) currently assume each worker is allocated exactly 1 GPU. The partial GPU and multi GPU use-cases can still be run with Ray Train today without these functions. diff --git a/doc/source/train/gbdt.rst b/doc/source/train/gbdt.rst index ba35e26e1a4e..636eee3f03fe 100644 --- a/doc/source/train/gbdt.rst +++ b/doc/source/train/gbdt.rst @@ -1,15 +1,9 @@ -.. _air-trainers-gbdt-user-guide: +.. 
_train-gbdt-guide: +XGBoost / LightGBM User Guide +============================= -Ray Train XGBoost/LightGBM User Guide -===================================== - -Ray Train has Trainers for XGBoost and LightGBM. These trainers: - -* enable `multi-node <#usage>`_ and `multi-GPU <#multi-gpu-training>`_ training -* integrate seamlessly with distributed `hyperparameter optimization <#hyperparameter-tuning>`_ library `Ray Tune `_ -* support `distributed data loading <#distributed-data-loading>`_ - +Ray Train has built-in support for XGBoost and LightGBM. Basic Usage ----------- diff --git a/doc/source/train/getting-started.rst b/doc/source/train/getting-started.rst new file mode 100644 index 000000000000..edf07924eead --- /dev/null +++ b/doc/source/train/getting-started.rst @@ -0,0 +1,181 @@ +.. _train-getting-started: + +Getting Started +=============== + +Ray Train offers multiple ``Trainers`` which implement scalable model training for different machine learning frameworks. +Here are examples for some of the commonly used trainers: + +.. tabbed:: XGBoost + + In this example we will train a model using distributed XGBoost. + + First, we load the dataset from S3 using Ray Datasets and split it into a + train and validation dataset. + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_intro_start__ + :end-before: __xgb_detail_intro_end__ + + In the :class:`ScalingConfig `, + we configure the number of workers to use: + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_scaling_start__ + :end-before: __xgb_detail_scaling_end__ + + We then instantiate our XGBoostTrainer by passing in: + + - The aforementioned ``ScalingConfig``. + - The ``label_column`` refers to the column name containing the labels in the Ray Dataset + - The ``params`` are `XGBoost training parameters `__ + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_training_start__ + :end-before: __xgb_detail_training_end__ + + Lastly, we call ``trainer.fit()`` to kick off training and obtain the results. + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_fit_start__ + :end-before: __xgb_detail_fit_end__ + +.. tabbed:: LightGBM + + In this example we will train a model using distributed LightGBM. + + First, we load the dataset from S3 using Ray Datasets and split it into a + train and validation dataset. + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_intro_start__ + :end-before: __lgbm_detail_intro_end__ + + In the :class:`ScalingConfig `, + we configure the number of workers to use: + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_scaling_start__ + :end-before: __xgb_detail_scaling_end__ + + We then instantiate our LightGBMTrainer by passing in: + + - The aforementioned ``ScalingConfig`` + - The ``label_column`` refers to the column name containing the labels in the Ray Dataset + - The ``params`` are core `LightGBM training parameters `__ + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_training_start__ + :end-before: __lgbm_detail_training_end__ + + And lastly we call ``trainer.fit()`` to kick off training and obtain the results. + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_fit_start__ + :end-before: __lgbm_detail_fit_end__ + +.. 
tabbed:: PyTorch + + This example shows how you can use Ray Train with PyTorch. + + First, set up your dataset and model. + + .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py + :language: python + :start-after: __torch_setup_begin__ + :end-before: __torch_setup_end__ + + + Now define your single-worker PyTorch training function. + + .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py + :language: python + :start-after: __torch_single_begin__ + :end-before: __torch_single_end__ + + This training function can be executed with: + + .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py + :language: python + :start-after: __torch_single_run_begin__ + :end-before: __torch_single_run_end__ + + Now let's convert this to a distributed multi-worker training function! + + All you have to do is use the ``ray.train.torch.prepare_model`` and + ``ray.train.torch.prepare_data_loader`` utility functions to + easily setup your model & data for distributed training. + This will automatically wrap your model with ``DistributedDataParallel`` + and place it on the right device, and add ``DistributedSampler`` to your DataLoaders. + + .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py + :language: python + :start-after: __torch_distributed_begin__ + :end-before: __torch_distributed_end__ + + Then, instantiate a ``TorchTrainer`` + with 4 workers, and use it to run the new training function! + + .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py + :language: python + :start-after: __torch_trainer_begin__ + :end-before: __torch_trainer_end__ + + See :ref:`train-porting-code` for a more comprehensive example. + +.. tabbed:: TensorFlow + + This example shows how you can use Ray Train to set up `Multi-worker training + with Keras `_. + + First, set up your dataset and model. + + .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + :language: python + :start-after: __tf_setup_begin__ + :end-before: __tf_setup_end__ + + Now define your single-worker TensorFlow training function. + + .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + :language: python + :start-after: __tf_single_begin__ + :end-before: __tf_single_end__ + + This training function can be executed with: + + .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + :language: python + :start-after: __tf_single_run_begin__ + :end-before: __tf_single_run_end__ + + Now let's convert this to a distributed multi-worker training function! + All you need to do is: + + 1. Set the per-worker batch size - each worker will process the same size + batch as in the single-worker code. + 2. Choose your TensorFlow distributed training strategy. In this example + we use the ``MultiWorkerMirroredStrategy``. + + .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + :language: python + :start-after: __tf_distributed_begin__ + :end-before: __tf_distributed_end__ + + Then, instantiate a ``TensorflowTrainer`` with 4 workers, + and use it to run the new training function! + + .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + :language: python + :start-after: __tf_trainer_begin__ + :end-before: __tf_trainer_end__ + + See :ref:`train-porting-code` for a more comprehensive example. 
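+
+Whichever framework you use, calling ``trainer.fit()`` blocks until training finishes and
+returns a ``Result`` object. As a minimal sketch (with ``trainer`` being any of the trainers
+constructed above), you can inspect it like this:
+
+.. code-block:: python
+
+    result = trainer.fit()
+
+    # Metrics reported during training, e.g. the final evaluation results.
+    print(result.metrics)
+
+    # The last saved checkpoint, which can be used for batch prediction
+    # or to resume training later.
+    checkpoint = result.checkpoint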
diff --git a/doc/source/train/images/train-specific.svg b/doc/source/train/images/train-specific.svg new file mode 100644 index 000000000000..1015f51645e4 --- /dev/null +++ b/doc/source/train/images/train-specific.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/train/key-concepts.rst b/doc/source/train/key-concepts.rst new file mode 100644 index 000000000000..bec0bed8460f --- /dev/null +++ b/doc/source/train/key-concepts.rst @@ -0,0 +1,121 @@ +.. _train-key-concepts: + +Key Concepts +============ + +There are four main concepts in the Ray Train library. + +1. ``Trainers`` execute distributed training. +2. ``Configuration`` objects are used to configure training. +3. ``Checkpoints`` are returned as the result of training. +4. ``Predictors`` can be used for inference and batch prediction. + +.. https://docs.google.com/drawings/d/1FezcdrXJuxLZzo6Rjz1CHyJzseH8nPFZp6IUepdn3N4/edit + +.. image:: images/train-specific.svg + +Trainers +-------- + +Trainers are responsible for executing (distributed) training runs. +The output of a Trainer run is a :ref:`Result ` that contains +metrics from the training run and the latest saved :ref:`Checkpoint `. +Trainers can also be configured with :ref:`Datasets ` and :ref:`Preprocessors ` for scalable data ingest and preprocessing. + +There are three categories of built-in Trainers: + +.. tabbed:: Deep Learning Trainers + + Ray Train supports the following deep learning trainers: + + - :class:`TorchTrainer ` + - :class:`TensorflowTrainer ` + - :class:`HorovodTrainer ` + + For these trainers, you usually define your own training function that loads the model + and executes single-worker training steps. Refer to the following guides for more details: + + - :ref:`Deep learning user guide ` + - :ref:`Quick overview of deep-learning trainers in the Ray AIR documentation ` + +.. tabbed:: Tree-Based Trainers + + Tree-based trainers utilize gradient-based decision trees for training. The most popular libraries + for this are XGBoost and LightGBM. + + - :class:`XGBoostTrainer ` + - :class:`LightGBMTrainer ` + + For these trainers, you just pass a dataset and parameters. The training loop is configured + automatically. + + - :ref:`XGBoost/LightGBM user guide ` + - :ref:`Quick overview of tree-based trainers in the Ray AIR documentation ` + + +.. tabbed:: Other Trainers + + Some trainers don't fit into the other two categories, such as: + + - :class:`HuggingFaceTrainer ` for NLP + - :class:`RLTrainer ` for reinforcement learning + - :class:`SklearnTrainer ` for (non-distributed) training of sklearn models. + + - :ref:`Other trainers in the Ray AIR documentation ` + +.. _train-key-concepts-config: + +Configuration +------------- + +Trainers are configured with configuration objects. There are two main configuration classes, +the :class:`ScalingConfig ` and the :class:`RunConfig `. +The latter contains subconfigurations, such as the :class:`FailureConfig `, +:class:`SyncConfig ` and :class:`CheckpointConfig `. + +Check out the :ref:`Configurations User Guide ` for an in-depth guide on using these configurations. + +.. _train-key-concepts-results: + +Checkpoints +----------- + +Calling ``Trainer.fit()`` returns a :class:`Result ` object, which includes +information about the run such as the reported metrics and the saved checkpoints. + +Checkpoints have the following purposes: + +* They can be passed to a Trainer to resume training from the given model state. +* They can be used to create a Predictor / BatchPredictor for scalable batch prediction. 
+* They can be deployed with Ray Serve. + +.. _train-key-concepts-predictors: + +Predictors +---------- + +Predictors are the counterpart to Trainers. A Trainer trains a model on a dataset, and a predictor +uses the resulting model and performs inference on it. + +Each Trainer has a respective Predictor implementation that is compatible with its generated checkpoints. + +.. dropdown:: Example: :class:`XGBoostPredictor ` + + .. literalinclude:: /train/doc_code/xgboost_train_predict.py + :language: python + :start-after: __train_predict_start__ + :end-before: __train_predict_end__ + + +A predictor can be passed into a :class:`BatchPredictor ` +is used to scale up prediction over a Ray cluster. +It takes a Ray Dataset as input. + +.. dropdown:: Example: Batch prediction with :class:`XGBoostPredictor ` + + .. literalinclude:: /train/doc_code/xgboost_train_predict.py + :language: python + :start-after: __batch_predict_start__ + :end-before: __batch_predict_end__ + +See :ref:`the Predictors user guide ` for more information and examples. diff --git a/doc/source/train/train.rst b/doc/source/train/train.rst index 92e41fa120ec..f3152daa2abb 100644 --- a/doc/source/train/train.rst +++ b/doc/source/train/train.rst @@ -2,166 +2,123 @@ .. _train-docs: -Ray Train: Distributed Deep Learning -==================================== +Ray Train: Scalable Model Training +================================== .. _`issue on GitHub`: https://github.com/ray-project/ray/issues -.. _`1.12 docs`: https://docs.ray.io/en/releases-1.12.0/raysgd/raysgd.html -.. tip:: Get in touch with us if you're using or considering using `Ray Train `_! +.. tip:: -Ray Train is a lightweight library for distributed deep learning, allowing you -to scale up and speed up training for your deep learning models. + Train is currently in **beta**. Fill out `this short form `_ to get involved with Train development! -The main features are: +Ray Train scales model training for popular ML frameworks such as Torch, XGBoost, TensorFlow, and more. It seamlessly integrates with other Ray libraries such as Tune and Predictors: -- **Ease of use**: Scale your single process training code to a cluster in just a couple lines of code. -- **Composability**: Ray Train interoperates with :ref:`Ray Tune ` to tune your distributed model and :ref:`Ray Datasets ` to train on large amounts of data. -- **Interactivity**: Ray Train fits in your workflow with support to run from any environment, including seamless Jupyter notebook support. +.. https://docs.google.com/drawings/d/1FezcdrXJuxLZzo6Rjz1CHyJzseH8nPFZp6IUepdn3N4/edit -.. note:: - - This API is in its Beta release (as of Ray 1.9) and may be revised in - future Ray releases. If you encounter any bugs, please file an - `issue on GitHub`_. - -.. note:: - - Ray Train replaces Ray SGD as the standard library for distributed deep learning on Ray. - Ray SGD has been fully deprecated as of Ray 1.13. If you are using an older version of Ray - and are looking for the Ray SGD docs, you can find them in the Ray `1.12 docs`_. +.. image:: images/train-specific.svg Intro to Ray Train ------------------ -Ray Train is a library that aims to simplify distributed deep learning. +**Framework support**: Train abstracts away the complexity of scaling up training +for common machine learning frameworks such as XGBoost, Pytorch, and Tensorflow. 
+There are three broad categories of Trainers that Train offers: -**Frameworks**: Ray Train is built to abstract away the coordination/configuration setup of distributed deep learning frameworks such as Pytorch Distributed and Tensorflow Distributed, allowing users to only focus on implementing training logic. +* :ref:`Deep Learning Trainers ` (Pytorch, Tensorflow, Horovod) +* :ref:`Tree-based Trainers ` (XGboost, LightGBM) +* Other ML frameworks (HuggingFace, Scikit-Learn, RLlib) -* For Pytorch, Ray Train automatically handles the construction of the distributed process group. -* For Tensorflow, Ray Train automatically handles the coordination of the ``TF_CONFIG``. The current implementation assumes that the user will use a MultiWorkerMirroredStrategy, but this will change in the near future. -* For Horovod, Ray Train automatically handles the construction of the Horovod runtime and Rendezvous server. - -**Built for data scientists/ML practitioners**: Ray Train has support for standard ML tools and features that practitioners love: +**Built for ML practitioners**: Train supports standard ML tools and features that practitioners love: * Callbacks for early stopping * Checkpointing * Integration with TensorBoard, Weights/Biases, and MLflow * Jupyter notebooks -**Integration with Ray Ecosystem**: Distributed deep learning often comes with a lot of complexity. - +**Batteries included**: Train is part of :ref:`Ray AIR ` and seamlessly operates in the Ray ecosystem. -* Use :ref:`Ray Datasets ` with Ray Train to handle and train on large amounts of data. -* Use :ref:`Ray Tune ` with Ray Train to leverage cutting edge hyperparameter techniques and distribute both your training and tuning. -* You can leverage the :ref:`Ray cluster launcher ` to launch autoscaling or spot instance clusters to train your model at scale on any cloud. +* Use :ref:`Ray Datasets ` with Train to load and process datasets both small and large. +* Use :ref:`Ray Tune ` with Train to sweep parameter grids and leverage cutting edge hyperparameter search algorithms. +* Leverage the :ref:`Ray cluster launcher ` to launch autoscaling or spot instance clusters on any cloud. Quick Start ----------- -Ray Train abstracts away the complexity of setting up a distributed training -system. Let's take following simple examples: - -.. tabbed:: PyTorch - - This example shows how you can use Ray Train with PyTorch. - - First, set up your dataset and model. - - .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py - :language: python - :start-after: __torch_setup_begin__ - :end-before: __torch_setup_end__ - - - Now define your single-worker PyTorch training function. - - .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py - :language: python - :start-after: __torch_single_begin__ - :end-before: __torch_single_end__ - - This training function can be executed with: - - .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py - :language: python - :start-after: __torch_single_run_begin__ - :end-before: __torch_single_run_end__ - - Now let's convert this to a distributed multi-worker training function! - - All you have to do is use the ``ray.train.torch.prepare_model`` and - ``ray.train.torch.prepare_data_loader`` utility functions to - easily setup your model & data for distributed training. - This will automatically wrap your model with ``DistributedDataParallel`` - and place it on the right device, and add ``DistributedSampler`` to your DataLoaders. - - .. 
literalinclude:: /../../python/ray/train/examples/torch_quick_start.py - :language: python - :start-after: __torch_distributed_begin__ - :end-before: __torch_distributed_end__ - - Then, instantiate a ``Trainer`` that uses a ``"torch"`` backend - with 4 workers, and use it to run the new training function! - - .. literalinclude:: /../../python/ray/train/examples/torch_quick_start.py - :language: python - :start-after: __torch_trainer_begin__ - :end-before: __torch_trainer_end__ - - See :ref:`train-porting-code` for a more comprehensive example. - -.. tabbed:: TensorFlow - - This example shows how you can use Ray Train to set up `Multi-worker training - with Keras `_. - - First, set up your dataset and model. - - .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py - :language: python - :start-after: __tf_setup_begin__ - :end-before: __tf_setup_end__ +.. tabbed:: XGBoost - Now define your single-worker TensorFlow training function. - - .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py - :language: python - :start-after: __tf_single_begin__ - :end-before: __tf_single_end__ - - This training function can be executed with: - - .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py - :language: python - :start-after: __tf_single_run_begin__ - :end-before: __tf_single_run_end__ - - Now let's convert this to a distributed multi-worker training function! - All you need to do is: - - 1. Set the *global* batch size - each worker will process the same size - batch as in the single-worker code. - 2. Choose your TensorFlow distributed training strategy. In this example - we use the ``MultiWorkerMirroredStrategy``. - - .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + .. literalinclude:: doc_code/gbdt_user_guide.py :language: python - :start-after: __tf_distributed_begin__ - :end-before: __tf_distributed_end__ + :start-after: __xgboost_start__ + :end-before: __xgboost_end__ - Then, instantiate a ``Trainer`` that uses a ``"tensorflow"`` backend - with 4 workers, and use it to run the new training function! +.. tabbed:: LightGBM - .. literalinclude:: /../../python/ray/train/examples/tensorflow_quick_start.py + .. literalinclude:: doc_code/gbdt_user_guide.py :language: python - :start-after: __tf_trainer_begin__ - :end-before: __tf_trainer_end__ - - See :ref:`train-porting-code` for a more comprehensive example. - - -**Next steps:** Check out the :ref:`User Guide `! + :start-after: __lightgbm_start__ + :end-before: __lightgbm_end__ + +.. tabbed:: Pytorch + + .. literalinclude:: /ray-air/doc_code/torch_trainer.py + :language: python + +.. tabbed:: Tensorflow + + .. literalinclude:: /ray-air/doc_code/tf_starter.py + :language: python + :start-after: __air_tf_train_start__ + :end-before: __air_tf_train_end__ + +.. tabbed:: Horovod + + .. literalinclude:: /ray-air/doc_code/hvd_trainer.py + :language: python + + +Framework Catalog +----------------- + +Here is a catalog of the framework-specific Trainer, Checkpoint, and Predictor classes that ship out of the box with Train: + +.. 
list-table:: + + * - **Trainer Class** + - **Checkpoint Class** + - **Predictor Class** + * - :class:`TorchTrainer ` + - :class:`TorchCheckpoint ` + - :class:`TorchPredictor ` + * - :class:`TensorflowTrainer ` + - :class:`TensorflowCheckpoint ` + - :class:`TensorflowPredictor ` + * - :class:`HorovodTrainer ` + - (Torch/TF Checkpoint) + - (Torch/TF Predictor) + * - :class:`XGBoostTrainer ` + - :class:`XGBoostCheckpoint ` + - :class:`XGBoostPredictor ` + * - :class:`LightGBMTrainer ` + - :class:`LightGBMCheckpoint ` + - :class:`LightGBMPredictor ` + * - :class:`SklearnTrainer ` + - :class:`SklearnCheckpoint ` + - :class:`SklearnPredictor ` + * - :class:`HuggingFaceTrainer ` + - :class:`HuggingFaceCheckpoint ` + - :class:`HuggingFacePredictor ` + * - :class:`RLTrainer ` + - :class:`RLCheckpoint ` + - :class:`RLPredictor ` + + +Next steps +---------- + +* :ref:`Getting Started ` +* :ref:`Key Concepts for Ray Train ` +* :ref:`User Guide for Deep Learning Trainers ` +* :ref:`User Guide for Tree-Based Trainers ` .. include:: /_includes/train/announcement_bottom.rst diff --git a/doc/source/train/user-guides.rst b/doc/source/train/user-guides.rst new file mode 100644 index 000000000000..cd636041ffe0 --- /dev/null +++ b/doc/source/train/user-guides.rst @@ -0,0 +1,45 @@ +User Guides +=========== + +.. panels:: + :container: container pb-4 full-width + :column: col-md-4 px-2 py-2 + :img-top-cls: pt-5 w-75 d-block mx-auto + + --- + :img-top: /ray-overview/images/ray_svg_logo.svg + + +++ + .. link-button:: config_guide + :type: ref + :text: Configurations User Guide + :classes: btn-link btn-block stretched-link + + --- + :img-top: /ray-overview/images/ray_svg_logo.svg + + +++ + .. link-button:: dl_guide + :type: ref + :text: Deep Learning User Guide + :classes: btn-link btn-block stretched-link + + + --- + :img-top: /ray-overview/images/ray_svg_logo.svg + + +++ + .. link-button:: gbdt + :type: ref + :text: XGBoost / LightGBM User Guide + :classes: btn-link btn-block stretched-link + + --- + :img-top: /ray-overview/images/ray_svg_logo.svg + + +++ + .. link-button:: architecture + :type: ref + :text: Ray Train Architecture + :classes: btn-link btn-block stretched-link + diff --git a/doc/source/tune/api_docs/overview.rst b/doc/source/tune/api_docs/overview.rst index 25e46f1554c7..a2581cf88495 100644 --- a/doc/source/tune/api_docs/overview.rst +++ b/doc/source/tune/api_docs/overview.rst @@ -21,6 +21,7 @@ on `Github`_. stoppers.rst result_grid.rst reporters.rst + syncing.rst logging.rst env.rst sklearn.rst diff --git a/doc/source/tune/api_docs/syncing.rst b/doc/source/tune/api_docs/syncing.rst new file mode 100644 index 000000000000..7fdb7c0dc2bd --- /dev/null +++ b/doc/source/tune/api_docs/syncing.rst @@ -0,0 +1,20 @@ +Syncing +======= + +.. _tune-syncconfig: + +SyncConfig +---------- + +.. autoclass:: ray.tune.syncer.SyncConfig + :members: + + + +.. _tune-syncer: + +Syncer +------ + +.. autoclass:: ray.tune.syncer.Syncer + :members: diff --git a/doc/source/tune/api_docs/trainable.rst b/doc/source/tune/api_docs/trainable.rst index ef57fccad064..ef3b04549085 100644 --- a/doc/source/tune/api_docs/trainable.rst +++ b/doc/source/tune/api_docs/trainable.rst @@ -23,8 +23,12 @@ Function API With the Function API, you can report intermediate metrics by simply calling ``session.report`` within the provided function. + .. code-block:: python + from ray import tune + from ray.air import session + def trainable(config): # config (dict): A dict of hyperparameters. 
diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 9893349aedbc..69e5ac9f7fc4 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -434,8 +434,10 @@ class FailureConfig: Will recover from the latest checkpoint if present. Setting to -1 will lead to infinite recovery retries. Setting to 0 will disable retries. Defaults to 0. - fail_fast: Whether to fail upon the first error. - If fail_fast='raise' provided, Tune will automatically + fail_fast: Whether to fail upon the first error. Only used for + Ray Tune - this does not apply + to single training runs (e.g. with ``Trainer.fit()``). + If fail_fast='raise' provided, Ray Tune will automatically raise the exception received by the Trainable. fail_fast='raise' can easily leak resources and should be used with caution (it is best used with `ray.init(local_mode=True)`). diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index 8205a2544ac1..27b1ba698a39 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -208,14 +208,14 @@ def _load_checkpoint( if isinstance(loaded_checkpoint, Checkpoint): # The new logic checkpoint_dict = loaded_checkpoint.to_dict() - self._latest_checkpoint_id = checkpoint_dict[TUNE_CHECKPOINT_ID] + self._latest_checkpoint_id = checkpoint_dict.get(TUNE_CHECKPOINT_ID, 0) return loaded_checkpoint # legacy path... if loaded_checkpoint is not None: # If the Tune trial is restarted, a new Trainer is instantiated. # However, we want the checkpoint_id to continue incrementing # from the previous run. - self._latest_checkpoint_id = loaded_checkpoint[TUNE_CHECKPOINT_ID] + self._latest_checkpoint_id = loaded_checkpoint.get(TUNE_CHECKPOINT_ID, 0) return loaded_checkpoint def add_tune_checkpoint_id(self, checkpoint: Dict): diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 470967d61f83..bcfdb9f01f76 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -81,8 +81,7 @@ def train_loop_per_worker(config: Dict): ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the - :ref:`Ray AIR session methods ` and - :ref:`Ray Train function utils `. + :ref:`Ray AIR session methods `. .. code-block:: python diff --git a/python/ray/train/examples/torch_quick_start.py b/python/ray/train/examples/torch_quick_start.py index d6ccae6b59d0..9f10e6333acb 100644 --- a/python/ray/train/examples/torch_quick_start.py +++ b/python/ray/train/examples/torch_quick_start.py @@ -84,7 +84,11 @@ def train_func_distributed(): # For GPU Training, set `use_gpu` to True. use_gpu = False - trainer = TorchTrainer(train_func_distributed, scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu)) + trainer = TorchTrainer( + train_func_distributed, + scaling_config=ScalingConfig( + num_workers=4, use_gpu=use_gpu) + ) results = trainer.fit() diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index da0e2c70a1b1..ac54e013c240 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -45,8 +45,7 @@ def train_loop_per_worker(config: Dict): ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the - :ref:`Ray AIR session methods ` and - :ref:`Ray Train function utils `. 
+ :ref:`Ray AIR session methods `. .. code-block:: python diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index bcf1453a3565..43b6c2263585 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -43,8 +43,7 @@ def train_loop_per_worker(config: Dict): ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the - :ref:`Ray AIR session methods ` and - :ref:`Ray Train function utils `. + :ref:`Ray AIR session methods `. .. code-block:: python @@ -68,8 +67,8 @@ def train_loop_per_worker(): # Returns the rank of the worker on the current node. session.get_local_rank() - You can also use any of the :ref:`TensorFlow specific function utils - `. + You can also use :meth:`ray.train.tensorflow.prepare_dataset_shard` + within your training code. .. code-block:: python diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index 8c02844b992e..e84012cbe098 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -43,8 +43,7 @@ def train_loop_per_worker(config: Dict): ``session.get_dataset_shard(...)`` will return the the entire Dataset. Inside the ``train_loop_per_worker`` function, you can use any of the - :ref:`Ray AIR session methods ` and - :ref:`Ray Train function utils `. + :ref:`Ray AIR session methods `. .. code-block:: python @@ -68,8 +67,8 @@ def train_loop_per_worker(): # Returns the rank of the worker on the current node. session.get_local_rank() - You can also use any of the :ref:`Torch specific function utils - `. + You can also use any of the Torch specific function utils, + such as :func:`ray.train.torch.get_device` and :func:`ray.train.torch.prepare_model` .. code-block:: python @@ -124,13 +123,16 @@ def train_loop_per_worker(): dataset_shard = session.get_dataset_shard("train") model = NeuralNetwork() loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.1) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) model = train.torch.prepare_model(model) for epoch in range(num_epochs): - for batch in iter(dataset_shard.to_torch(batch_size=32)): - output = model(input) + for batches in dataset_shard.iter_torch_batches( + batch_size=32, dtypes=torch.float + ): + inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"] + output = model(inputs) loss = loss_fn(output, labels) optimizer.zero_grad() loss.backward() @@ -144,7 +146,9 @@ def train_loop_per_worker(): ), ) - train_dataset = ray.data.from_items([1, 2, 3]) + train_dataset = ray.data.from_items( + [{"x": x, "y": 2 * x + 1} for x in range(200)] + ) scaling_config = ScalingConfig(num_workers=3) # If using GPUs, use the below scaling config instead. # scaling_config = ScalingConfig(num_workers=3, use_gpu=True) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 3c141bd91781..74866c492186 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -139,8 +139,8 @@ class Syncer(abc.ABC): This class handles data transfer for two cases: 1. Synchronizing data from the driver to external storage. This affects - experiment-level checkpoints and trial-level checkpoints if no cloud storage - is used. + experiment-level checkpoints and trial-level checkpoints if no cloud storage + is used. 2. Synchronizing data from remote trainables to external storage. 
Synchronizing tasks are usually asynchronous and can be awaited using ``wait()``.
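+
+    For example (a minimal sketch), syncing results and checkpoints to cloud storage
+    is typically enabled by passing a ``SyncConfig`` with an ``upload_dir`` to the
+    run configuration, rather than by using a ``Syncer`` directly:
+
+    .. code-block:: python
+
+        from ray.air import RunConfig
+        from ray.tune import SyncConfig
+
+        run_config = RunConfig(
+            sync_config=SyncConfig(upload_dir="s3://remote-bucket/location")
+        )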