From 1be63dfec49984bc98306f237d656172c1b6afa0 Mon Sep 17 00:00:00 2001
From: Matthew Deng
Date: Mon, 16 Jan 2023 11:56:21 -0800
Subject: [PATCH 1/2] [train][docs] update docstrings/quickstarts to work when
 use_gpu=True

Signed-off-by: Matthew Deng
---
 doc/source/ray-air/doc_code/hf_trainer.py           | 10 ++++++----
 doc/source/ray-air/doc_code/hvd_trainer.py          | 10 ++++++----
 doc/source/ray-air/doc_code/tf_starter.py           |  8 +++++---
 doc/source/ray-air/doc_code/torch_trainer.py        | 11 +++++++----
 python/ray/train/horovod/horovod_trainer.py         |  7 ++++---
 python/ray/train/huggingface/huggingface_trainer.py |  8 +++++---
 python/ray/train/tensorflow/tensorflow_trainer.py   |  5 ++++-
 python/ray/train/torch/torch_trainer.py             |  9 +++++----
 8 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/doc/source/ray-air/doc_code/hf_trainer.py b/doc/source/ray-air/doc_code/hf_trainer.py
index bbd54c3bb336..1d81d36dc35c 100644
--- a/doc/source/ray-air/doc_code/hf_trainer.py
+++ b/doc/source/ray-air/doc_code/hf_trainer.py
@@ -12,6 +12,10 @@
 from ray.train.huggingface import HuggingFaceTrainer
 from ray.air.config import ScalingConfig

+
+# If using GPUs, set this to True.
+use_gpu = False
+
 model_checkpoint = "gpt2"
 tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
 block_size = 128
@@ -66,7 +70,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
         logging_strategy="epoch",
         learning_rate=2e-5,
         weight_decay=0.01,
-        no_cuda=True,  # Set to False for GPU training
+        no_cuda=(not use_gpu),
     )
     return transformers.Trainer(
         model=model,
@@ -76,9 +80,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
     )


-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
 trainer = HuggingFaceTrainer(
     trainer_init_per_worker=trainer_init_per_worker,
     scaling_config=scaling_config,
diff --git a/doc/source/ray-air/doc_code/hvd_trainer.py b/doc/source/ray-air/doc_code/hvd_trainer.py
index eff640acb4ff..7c4e4fd67c42 100644
--- a/doc/source/ray-air/doc_code/hvd_trainer.py
+++ b/doc/source/ray-air/doc_code/hvd_trainer.py
@@ -8,6 +8,10 @@
 from ray.train.horovod import HorovodTrainer
 from ray.air.config import ScalingConfig

+# If using GPUs, set this to True.
+use_gpu = False
+
+
 input_size = 1
 layer_size = 15
 output_size = 1
@@ -43,7 +47,7 @@ def train_loop_per_worker():
     for epoch in range(num_epochs):
         model.train()
         for batch in dataset_shard.iter_torch_batches(
-            batch_size=32, dtypes=torch.float
+            batch_size=32, dtypes=torch.float, device=train.torch.get_device()
         ):
             inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"]
             inputs.to(device)
@@ -61,9 +65,7 @@ def train_loop_per_worker():


 train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
 trainer = HorovodTrainer(
     train_loop_per_worker=train_loop_per_worker,
     scaling_config=scaling_config,
diff --git a/doc/source/ray-air/doc_code/tf_starter.py b/doc/source/ray-air/doc_code/tf_starter.py
index 02f99cd4bedb..53f39953b40f 100644
--- a/doc/source/ray-air/doc_code/tf_starter.py
+++ b/doc/source/ray-air/doc_code/tf_starter.py
@@ -10,6 +10,10 @@
 from ray.train.tensorflow import TensorflowTrainer
 from ray.air.config import ScalingConfig

+
+# If using GPUs, set this to True.
+use_gpu = False
+
 a = 5
 b = 10
 size = 100
@@ -59,9 +63,7 @@ def train_func(config: dict):
 train_dataset = ray.data.from_items(
     [{"x": x / 200, "y": 2 * x / 200} for x in range(200)]
 )
-scaling_config = ScalingConfig(num_workers=2)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=2, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=2, use_gpu=use_gpu)
 trainer = TensorflowTrainer(
     train_loop_per_worker=train_func,
     train_loop_config=config,
diff --git a/doc/source/ray-air/doc_code/torch_trainer.py b/doc/source/ray-air/doc_code/torch_trainer.py
index 486f6553d21a..f8d4dc45bcfe 100644
--- a/doc/source/ray-air/doc_code/torch_trainer.py
+++ b/doc/source/ray-air/doc_code/torch_trainer.py
@@ -7,6 +7,11 @@
 from ray.train.torch import TorchTrainer
 from ray.air.config import ScalingConfig

+
+# If using GPUs, set this to True.
+use_gpu = False
+
+
 input_size = 1
 layer_size = 15
 output_size = 1
@@ -34,7 +39,7 @@ def train_loop_per_worker():

     for epoch in range(num_epochs):
         for batches in dataset_shard.iter_torch_batches(
-            batch_size=32, dtypes=torch.float
+            batch_size=32, dtypes=torch.float, device=train.torch.get_device()
         ):
             inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"]
             output = model(inputs)
@@ -53,9 +58,7 @@ def train_loop_per_worker():


 train_dataset = ray.data.from_items([{"x": x, "y": 2 * x + 1} for x in range(200)])
-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
 trainer = TorchTrainer(
     train_loop_per_worker=train_loop_per_worker,
     scaling_config=scaling_config,
diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py
index c4a4c957cc6b..98cc572791d6 100644
--- a/python/ray/train/horovod/horovod_trainer.py
+++ b/python/ray/train/horovod/horovod_trainer.py
@@ -92,6 +92,9 @@ def train_loop_per_worker():
             from ray.train.torch import TorchCheckpoint
             from ray.air.config import ScalingConfig

+            # If using GPUs, set this to True.
+            use_gpu = False
+
             input_size = 1
             layer_size = 15
             output_size = 1
@@ -142,9 +145,7 @@ def train_loop_per_worker():
                 ),
             )
             train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-            scaling_config = ScalingConfig(num_workers=3)
-            # If using GPUs, use the below scaling config instead.
-            # scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+            scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
             trainer = HorovodTrainer(
                 train_loop_per_worker=train_loop_per_worker,
                 scaling_config=scaling_config,
diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py
index 8afe9c2784b0..5e7a145c788f 100644
--- a/python/ray/train/huggingface/huggingface_trainer.py
+++ b/python/ray/train/huggingface/huggingface_trainer.py
@@ -124,6 +124,9 @@ class HuggingFaceTrainer(TorchTrainer):
             from ray.train.huggingface import HuggingFaceTrainer
             from ray.air.config import ScalingConfig

+            # If using GPUs, set this to True.
+            use_gpu = False
+
             model_checkpoint = "gpt2"
             tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
             block_size = 128
@@ -180,6 +183,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
                     logging_strategy="epoch",
                     learning_rate=2e-5,
                     weight_decay=0.01,
+                    no_cuda=(not use_gpu),
                 )
                 return transformers.Trainer(
                     model=model,
@@ -188,9 +192,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
                     train_dataset=train_dataset,
                     eval_dataset=eval_dataset,
                 )
-            scaling_config = ScalingConfig(num_workers=3)
-            # If using GPUs, use the below scaling config instead.
-            # scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+            scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
             trainer = HuggingFaceTrainer(
                 trainer_init_per_worker=trainer_init_per_worker,
                 scaling_config=scaling_config,
diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py
index 44a754647739..6b2ef8609df1 100644
--- a/python/ray/train/tensorflow/tensorflow_trainer.py
+++ b/python/ray/train/tensorflow/tensorflow_trainer.py
@@ -94,6 +94,9 @@ def train_loop_per_worker():
             from ray.air.config import ScalingConfig
             from ray.train.tensorflow import TensorflowTrainer

+            # If using GPUs, set this to True.
+            use_gpu = False
+
             def build_model():
                 # toy neural network : 1-layer
                 return tf.keras.Sequential(
@@ -128,7 +131,7 @@ def train_loop_per_worker(config):
             train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
             trainer = TensorflowTrainer(
                 train_loop_per_worker=train_loop_per_worker,
-                scaling_config=ScalingConfig(num_workers=3),
+                scaling_config=ScalingConfig(num_workers=3, use_gpu=use_gpu),
                 datasets={"train": train_dataset},
                 train_loop_config={"num_epochs": 2},
             )
diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py
index 8323d0ad5f8c..2319b6d6225b 100644
--- a/python/ray/train/torch/torch_trainer.py
+++ b/python/ray/train/torch/torch_trainer.py
@@ -108,6 +108,9 @@ def train_loop_per_worker():
             from ray.air.config import RunConfig
             from ray.air.config import CheckpointConfig

+            # If using GPUs, set this to True.
+            use_gpu = False
+
             # Define NN layers architecture, epochs, and number of workers
             input_size = 1
             layer_size = 32
@@ -145,7 +148,7 @@ def train_loop_per_worker():
             # Iterate over epochs and batches
             for epoch in range(num_epochs):
                 for batches in dataset_shard.iter_torch_batches(batch_size=32,
-                            dtypes=torch.float):
+                            dtypes=torch.float, device=train.torch.get_device()):

                     # Add batch or unsqueeze as an additional dimension [32, x]
                     inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"]
@@ -176,9 +179,7 @@ def train_loop_per_worker():
             )

             # Define scaling and run configs
-            # If using GPUs, use the below scaling config instead.
-            # scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
-            scaling_config = ScalingConfig(num_workers=num_workers)
+            scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
             run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1))

             trainer = TorchTrainer(

From b6058d638dc6230f9a0b264078e1fbb712e08aa6 Mon Sep 17 00:00:00 2001
From: Matthew Deng
Date: Mon, 16 Jan 2023 12:02:50 -0800
Subject: [PATCH 2/2] horovod

Signed-off-by: Matthew Deng
---
 python/ray/train/horovod/horovod_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py
index 98cc572791d6..d148e6775fae 100644
--- a/python/ray/train/horovod/horovod_trainer.py
+++ b/python/ray/train/horovod/horovod_trainer.py
@@ -127,7 +127,7 @@ def train_loop_per_worker():
             for epoch in range(num_epochs):
                 model.train()
                 for batch in dataset_shard.iter_torch_batches(
-                    batch_size=32, dtypes=torch.float
+                    batch_size=32, dtypes=torch.float, device=train.torch.get_device()
                 ):
                     inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"]
                     inputs.to(device)
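
Both patches apply one pattern: a single use_gpu flag feeds both worker scheduling (ScalingConfig(..., use_gpu=use_gpu)) and device placement (no_cuda=(not use_gpu) for Transformers, device=train.torch.get_device() for Torch batches), so each quickstart runs unchanged on CPU or GPU. For reference, here is a minimal standalone sketch of the updated Torch quickstart. It is illustrative rather than part of the patch series, and it assumes the Ray AIR 2.2-era APIs these files use (ray.air.session, train.torch.prepare_model, train.torch.get_device).

    import torch
    import torch.nn as nn

    import ray
    from ray import train
    from ray.air import session
    from ray.air.config import ScalingConfig
    from ray.train.torch import TorchTrainer

    # If using GPUs, set this to True (the toggle these patches introduce).
    use_gpu = False

    input_size = 1
    layer_size = 15
    output_size = 1
    num_epochs = 3


    class NeuralNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = nn.Linear(input_size, layer_size)
            self.relu = nn.ReLU()
            self.layer2 = nn.Linear(layer_size, output_size)

        def forward(self, input):
            return self.layer2(self.relu(self.layer1(input)))


    def train_loop_per_worker():
        dataset_shard = session.get_dataset_shard("train")
        model = NeuralNetwork()
        loss_fn = nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        # prepare_model moves the model to this worker's device (a GPU when
        # use_gpu=True), so input batches must be placed on the same device.
        model = train.torch.prepare_model(model)

        for epoch in range(num_epochs):
            for batches in dataset_shard.iter_torch_batches(
                batch_size=32, dtypes=torch.float, device=train.torch.get_device()
            ):
                inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"]
                output = model(inputs)
                loss = loss_fn(output, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            session.report({"loss": loss.item(), "epoch": epoch})


    train_dataset = ray.data.from_items([{"x": x, "y": 2 * x + 1} for x in range(200)])
    scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=scaling_config,
        datasets={"train": train_dataset},
    )
    result = trainer.fit()

With use_gpu = True, ScalingConfig schedules one GPU per worker and prepare_model moves the model onto it; passing device=train.torch.get_device() to iter_torch_batches keeps the input tensors on that same device, which is exactly the CPU/GPU mismatch these patches fix.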