ray-project · amogkam · Jul 7, 2022 · Jun 13, 2022 · Jun 14, 2022 · Jun 14, 2022
@@ -15,10 +15,10 @@ General Examples
 PyTorch
 ~~~~~~~
 
-* :doc:`/train/examples/train_linear_example`:
+* :doc:`/train/examples/torch_linear_example`:
   Simple example for PyTorch.
 
-* :doc:`/train/examples/train_fashion_mnist_example`:
+* :doc:`/train/examples/torch_fashion_mnist_example`:
   End-to-end example for PyTorch.
 
 * :doc:`/train/examples/transformers/transformers_example`:
@@ -59,10 +59,10 @@ Ray Datasets Integration Examples
 * :doc:`/train/examples/tensorflow_linear_dataset_example`:
   Simple example for training a linear TensorFlow model.
 
-* :doc:`/train/examples/train_linear_dataset_example`:
+* :doc:`/train/examples/torch_linear_dataset_example`:
   Simple example for training a linear PyTorch model.
 
-* :doc:`/train/examples/tune_linear_dataset_example`:
+* :doc:`/train/examples/tune_torch_linear_dataset_example`:
   Simple example for tuning a linear PyTorch model.
 
 
@@ -75,7 +75,7 @@ Ray Tune Integration Examples
 * :doc:`/train/examples/tune_tensorflow_mnist_example`:
   End-to-end example for tuning a TensorFlow model.
 
-* :doc:`/train/examples/tune_cifar_pytorch_pbt_example`:
+* :doc:`/train/examples/tune_cifar_torch_pbt_example`:
   End-to-end example for tuning a PyTorch model with PBT.
 
 ..

@@ -0,0 +1,6 @@
+:orphan:
+
+torch_fashion_mnist_example
+===========================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_fashion_mnist_example.py
@@ -0,0 +1,6 @@
+:orphan:
+
+torch_linear_dataset_example
+============================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_linear_dataset_example.py
@@ -0,0 +1,6 @@
+:orphan:
+
+torch_linear_example
+====================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_linear_example.py
@@ -0,0 +1,6 @@
+:orphan:
+
+tune_cifar_torch_pbt_example
+============================
+
+.. literalinclude:: /../../python/ray/train/examples/tune_cifar_torch_pbt_example.py
@@ -0,0 +1,6 @@
+:orphan:
+
+tune_torch_linear_dataset_example
+=================================
+
+.. literalinclude:: /../../python/ray/air/examples/pytorch/tune_torch_linear_dataset_example.py
diff --git a/python/ray/air/result.py b/python/ray/air/result.py
@@ -1,5 +1,6 @@
-from typing import Any, Dict, List, Optional, Tuple
 from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
 
 from ray.air.checkpoint import Checkpoint
 from ray.util.annotations import PublicAPI
@@ -15,7 +16,7 @@ class Result:
     This is the class produced by Trainer.fit().
     It contains a checkpoint, which can be used for resuming training and for
     creating a Predictor object. It also contains a metrics object describing
-    training metrics. `error` is included so that non successful runs
+    training metrics. ``error`` is included so that non successful runs
     and trials can be represented as well.
 
     The constructor is a private API.
@@ -24,6 +25,7 @@ class Result:
         metrics: The final metrics as reported by an Trainable.
         checkpoint: The final checkpoint of the Trainable.
         error: The execution error of the Trainable run, if the trial finishes in error.
+        log_dir: Directory where the trial logs are saved.
         metrics_dataframe: The full result dataframe of the Trainable.
             The dataframe is indexed by iterations and contains reported
             metrics.
@@ -37,6 +39,7 @@ class Result:
     metrics: Optional[Dict[str, Any]]
     checkpoint: Optional[Checkpoint]
     error: Optional[Exception]
+    log_dir: Optional[Path]
     metrics_dataframe: Optional[pd.DataFrame]
     best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]]
 

@@ -39,15 +39,6 @@ py_test(
     deps = [":train_lib"]
 )
 
-py_test(
-    name = "torch_tensorboard_profiler_example",
-    size = "small",
-    main = "examples/torch_tensorboard_profiler_example.py",
-    srcs = ["examples/torch_tensorboard_profiler_example.py"],
-    tags = ["team:ml", "exclusive"],
-    deps = [":train_lib"]
-)
-
 py_test(
     name = "transformers_example_gpu",
     size = "large",
@@ -73,25 +64,15 @@ py_test(
 )
 
 py_test(
-    name = "tune_cifar_pytorch_pbt_example",
+    name = "tune_cifar_torch_pbt_example",
     size = "medium",
-    main = "examples/tune_cifar_pytorch_pbt_example.py",
-    srcs = ["examples/tune_cifar_pytorch_pbt_example.py"],
+    main = "examples/tune_cifar_torch_pbt_example.py",
+    srcs = ["examples/tune_cifar_torch_pbt_example.py"],
     tags = ["team:ml", "exclusive", "pytorch", "tune"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
 
-py_test(
-    name = "tune_linear_dataset_example",
-    size = "medium",
-    main = "examples/tune_linear_dataset_example.py",
-    srcs = ["examples/tune_linear_dataset_example.py"],
-    tags = ["team:ml", "exclusive", "gpu_only", "tune"],
-    deps = [":train_lib"],
-    args = ["--smoke-test", "--use-gpu"]
-)
-
 py_test(
     name = "tune_linear_example",
     size = "medium",

@@ -2,15 +2,17 @@
 import os
 
 import horovod.torch as hvd
-import ray
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data.distributed
 from filelock import FileLock
-from ray.train import Trainer
 from torchvision import datasets, transforms
 
+import ray
+from ray import train
+from ray.train.horovod import HorovodTrainer
+
 
 def metric_average(val, name):
     tensor = torch.tensor(val)
@@ -142,21 +144,21 @@ def train_func(config):
 
     model, optimizer, train_loader, train_sampler = setup(config)
 
-    results = []
     for epoch in range(num_epochs):
         loss = train_epoch(
             model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda
         )
-        results.append(loss)
-    return results
+        train.report(loss=loss)
 
 
 def main(num_workers, use_gpu, kwargs):
-    trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
-    trainer.start()
-    loss_per_epoch = trainer.run(train_func, config=kwargs)
-    trainer.shutdown()
-    print(loss_per_epoch)
+    trainer = HorovodTrainer(
+        train_func,
+        train_loop_config=kwargs,
+        scaling_config={"use_gpu": use_gpu, "num_workers": num_workers},
+    )
+    results = trainer.fit()
+    print(results)
 
 
 # Horovod Class API.

@@ -1,20 +1,23 @@
 import argparse
 
-from ray.train import Trainer
-from ray.train.examples.train_fashion_mnist_example import train_func
-from ray.train.callbacks.logging import MLflowLoggerCallback
+from ray.air import RunConfig
+from ray.train.examples.torch_fashion_mnist_example import train_func
+from ray.train.torch import TorchTrainer
+from ray.tune.integration.mlflow import MLflowLoggerCallback
 
 
 def main(num_workers=2, use_gpu=False):
-    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
-    trainer.start()
-    final_results = trainer.run(
-        train_func=train_func,
-        config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
-        callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")],
+    trainer = TorchTrainer(
+        train_func,
+        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
+        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
+        run_config=RunConfig(
+            callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")]
+        ),
     )
+    final_results = trainer.fit()
 
-    print("Full losses for rank 0 worker: ", final_results)
+    print("Full results for rank 0 worker: ", final_results)
 
 
 if __name__ == "__main__":
@@ -44,7 +47,7 @@ def main(num_workers=2, use_gpu=False):
     import ray
 
     if args.smoke_test:
-        ray.init(num_cpus=2)
+        ray.init(num_cpus=4)
         args.num_workers = 2
         args.use_gpu = False
     else:

@@ -1,40 +1,53 @@
 from ray import train
-from ray.train import Trainer
-from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback
+from ray.air import RunConfig
+from ray.train.torch import TorchTrainer
+from ray.tune.integration.mlflow import MLflowLoggerCallback
+from ray.tune.logger import TBXLoggerCallback
 
 
 def train_func():
     for i in range(3):
         train.report(epoch=i)
 
 
-trainer = Trainer(backend="torch", num_workers=2)
-trainer.start()
+trainer = TorchTrainer(
+    train_func,
+    scaling_config={"num_workers": 2},
+    run_config=RunConfig(
+        callbacks=[
+            MLflowLoggerCallback(experiment_name="train_experiment"),
+            TBXLoggerCallback(),
+        ],
+    ),
+)
 
 # Run the training function, logging all the intermediate results
 # to MLflow and Tensorboard.
-result = trainer.run(
-    train_func,
-    callbacks=[
-        MLflowLoggerCallback(experiment_name="train_experiment"),
-        TBXLoggerCallback(),
-    ],
-)
+result = trainer.fit()
 
-# Print the latest run directory and keep note of it.
-# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
-print("Run directory:", trainer.latest_run_dir)
+# For MLflow logs:
+
+# By default, MLflow logs are saved in an `mlflow` directory
+# in the current working directory.
 
-trainer.shutdown()
+# $ cd mlflow
+# # View the MLflow UI.
+# $ mlflow ui
+
+# You can change the directory by setting the `tracking_uri` argument
+# in `MLflowLoggerCallback`.
+
+# For TensorBoard logs:
+
+# Print the latest run directory and keep note of it.
+# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06
+print("Run directory:", result.log_dir.parent)  # TensorBoard is saved in parent dir
 
 # How to visualize the logs
 
 # Navigate to the run directory of the trainer.
-# For example `cd /home/ray_results/train_2021-09-01_12-00-00/run_001`
+# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06`
 # $ cd <TRAINER_RUN_DIR>
 #
-# # View the MLflow UI.
-# $ mlflow ui
-#
 # # View the tensorboard UI.
 # $ tensorboard --logdir .