[train][docs] update docstrings/quickstarts to work when use_gpu=True #31692

Merged · 4 commits · Jan 26, 2023
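This PR threads a single `use_gpu` flag through each quickstart and docstring example, so the same snippet runs on CPU by default (as in CI) and on GPU after flipping one line. Condensed, the recurring pattern looks like this (a sketch distilled from the diffs below, not any one file verbatim):

```python
from ray.air.config import ScalingConfig

# If using GPUs, set this to True.
use_gpu = False

# One flag drives every device-dependent setting:
scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)  # worker resources
no_cuda = not use_gpu  # fed to transformers.TrainingArguments in the HF example
# ...and torch training loops request batches directly on the worker's device:
#   dataset_shard.iter_torch_batches(..., device=train.torch.get_device())
```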
Changes from all commits
10 changes: 6 additions & 4 deletions doc/source/ray-air/doc_code/hf_trainer.py
@@ -12,6 +12,10 @@
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import ScalingConfig


+# If using GPUs, set this to True.
+use_gpu = False
Contributor:

A dumb, yet effective trick for cases where you want to show users that they can use GPUs, but want CPUs on CI:

use_gpu = True  # include in docs

use_gpu = False  # exclude

<code using use_gpu>
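For context, a minimal sketch of how the trick behaves, assuming the doc build renders only the first assignment while CI executes the whole file (the line-selection mechanism is an assumption, not part of this PR):

```python
from ray.air.config import ScalingConfig

use_gpu = True  # the line rendered in the docs
use_gpu = False  # runs last in CI, so tests stay CPU-only

# Downstream code keys off the single flag, so CI exercises
# exactly the code users see, minus the GPU.
scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
assert scaling_config.use_gpu is False
```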


model_checkpoint = "gpt2"
tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
block_size = 128
@@ -66,7 +70,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
logging_strategy="epoch",
learning_rate=2e-5,
weight_decay=0.01,
-no_cuda=True,  # Set to False for GPU training
+no_cuda=(not use_gpu),
)
return transformers.Trainer(
model=model,
@@ -76,9 +80,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
)


-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
trainer = HuggingFaceTrainer(
trainer_init_per_worker=trainer_init_per_worker,
scaling_config=scaling_config,
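The `no_cuda` switch above comes from `transformers.TrainingArguments`: it forces CPU training even when CUDA is available, so tying it to the shared flag keeps the HF Trainer consistent with the ScalingConfig. A minimal standalone sketch (assuming `transformers` is installed; `no_cuda` was the current name for this switch at the time):

```python
from transformers import TrainingArguments

use_gpu = False
args = TrainingArguments(
    output_dir="out",
    no_cuda=(not use_gpu),  # with use_gpu=False, train on CPU even if a GPU exists
)
print(args.no_cuda)  # True
```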
10 changes: 6 additions & 4 deletions doc/source/ray-air/doc_code/hvd_trainer.py
@@ -8,6 +8,10 @@
from ray.train.horovod import HorovodTrainer
from ray.air.config import ScalingConfig

+# If using GPUs, set this to True.
+use_gpu = False


input_size = 1
layer_size = 15
output_size = 1
@@ -43,7 +47,7 @@ def train_loop_per_worker():
for epoch in range(num_epochs):
model.train()
for batch in dataset_shard.iter_torch_batches(
-batch_size=32, dtypes=torch.float
+batch_size=32, dtypes=torch.float, device=train.torch.get_device()
):
inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"]
inputs.to(device)
@@ -61,9 +65,7 @@ def train_loop_per_worker():


train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
trainer = HorovodTrainer(
train_loop_per_worker=train_loop_per_worker,
scaling_config=scaling_config,
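The new `device=train.torch.get_device()` argument makes `iter_torch_batches` yield tensors already placed on the worker's assigned device (a GPU when `use_gpu=True`, otherwise the CPU), removing per-batch `.to(device)` bookkeeping. A standalone sketch of the idea (illustrative, not taken from this PR; `get_device()` is only meaningful inside a Train worker):

```python
import torch
from ray import train
import ray.train.torch  # provides train.torch.get_device() on a Train worker

def train_loop_per_worker():
    device = train.torch.get_device()  # e.g. cuda:0 when use_gpu=True, else cpu
    model = torch.nn.Linear(1, 1).to(device)
    batch = torch.ones(8, 1, device=device)
    model(batch)  # model and data live on the same device, CPU or GPU
```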
8 changes: 5 additions & 3 deletions doc/source/ray-air/doc_code/tf_starter.py
@@ -10,6 +10,10 @@
from ray.train.tensorflow import TensorflowTrainer
from ray.air.config import ScalingConfig


+# If using GPUs, set this to True.
+use_gpu = False

a = 5
b = 10
size = 100
@@ -59,9 +63,7 @@ def train_func(config: dict):
train_dataset = ray.data.from_items(
[{"x": x / 200, "y": 2 * x / 200} for x in range(200)]
)
-scaling_config = ScalingConfig(num_workers=2)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=2, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=2, use_gpu=use_gpu)
trainer = TensorflowTrainer(
train_loop_per_worker=train_func,
train_loop_config=config,
11 changes: 7 additions & 4 deletions doc/source/ray-air/doc_code/torch_trainer.py
@@ -7,6 +7,11 @@
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig


+# If using GPUs, set this to True.
+use_gpu = False


input_size = 1
layer_size = 15
output_size = 1
@@ -34,7 +39,7 @@ def train_loop_per_worker():

for epoch in range(num_epochs):
for batches in dataset_shard.iter_torch_batches(
-batch_size=32, dtypes=torch.float
+batch_size=32, dtypes=torch.float, device=train.torch.get_device()
):
inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"]
output = model(inputs)
@@ -53,9 +58,7 @@ def train_loop_per_worker():


train_dataset = ray.data.from_items([{"x": x, "y": 2 * x + 1} for x in range(200)])
-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
trainer = TorchTrainer(
train_loop_per_worker=train_loop_per_worker,
scaling_config=scaling_config,
9 changes: 5 additions & 4 deletions python/ray/train/horovod/horovod_trainer.py
@@ -92,6 +92,9 @@ def train_loop_per_worker():
from ray.train.torch import TorchCheckpoint
from ray.air.config import ScalingConfig

+# If using GPUs, set this to True.
+use_gpu = False

input_size = 1
layer_size = 15
output_size = 1
@@ -124,7 +127,7 @@ def train_loop_per_worker():
for epoch in range(num_epochs):
model.train()
for batch in dataset_shard.iter_torch_batches(
-batch_size=32, dtypes=torch.float
+batch_size=32, dtypes=torch.float, device=train.torch.get_device()
):
inputs, labels = torch.unsqueeze(batch["x"], 1), batch["y"]
inputs.to(device)
@@ -142,9 +145,7 @@
),
)
train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
trainer = HorovodTrainer(
train_loop_per_worker=train_loop_per_worker,
scaling_config=scaling_config,
8 changes: 5 additions & 3 deletions python/ray/train/huggingface/huggingface_trainer.py
@@ -124,6 +124,9 @@ class HuggingFaceTrainer(TorchTrainer):
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import ScalingConfig

+# If using GPUs, set this to True.
+use_gpu = False

model_checkpoint = "gpt2"
tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"
block_size = 128
@@ -180,6 +183,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
logging_strategy="epoch",
learning_rate=2e-5,
weight_decay=0.01,
+no_cuda=(not use_gpu),
)
return transformers.Trainer(
model=model,
@@ -188,9 +192,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config):
eval_dataset=eval_dataset,
)

-scaling_config = ScalingConfig(num_workers=3)
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
trainer = HuggingFaceTrainer(
trainer_init_per_worker=trainer_init_per_worker,
scaling_config=scaling_config,
5 changes: 4 additions & 1 deletion python/ray/train/tensorflow/tensorflow_trainer.py
@@ -94,6 +94,9 @@ def train_loop_per_worker():
from ray.air.config import ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

+# If using GPUs, set this to True.
+use_gpu = False

def build_model():
# toy neural network : 1-layer
return tf.keras.Sequential(
@@ -128,7 +131,7 @@ def train_loop_per_worker(config):
train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
trainer = TensorflowTrainer(
train_loop_per_worker=train_loop_per_worker,
-scaling_config=ScalingConfig(num_workers=3),
+scaling_config=ScalingConfig(num_workers=3, use_gpu=use_gpu),
datasets={"train": train_dataset},
train_loop_config={"num_epochs": 2},
)
9 changes: 5 additions & 4 deletions python/ray/train/torch/torch_trainer.py
@@ -108,6 +108,9 @@ def train_loop_per_worker():
from ray.air.config import RunConfig
from ray.air.config import CheckpointConfig

+# If using GPUs, set this to True.
+use_gpu = False

# Define NN layers architecture, epochs, and number of workers
input_size = 1
layer_size = 32
Expand Down Expand Up @@ -145,7 +148,7 @@ def train_loop_per_worker():
# Iterate over epochs and batches
for epoch in range(num_epochs):
for batches in dataset_shard.iter_torch_batches(batch_size=32,
-dtypes=torch.float):
+dtypes=torch.float, device=train.torch.get_device()):

# Add batch or unsqueeze as an additional dimension [32, x]
inputs, labels = torch.unsqueeze(batches["x"], 1), batches["y"]
@@ -176,9 +179,7 @@ def train_loop_per_worker():
)

# Define scaling and run configs
-# If using GPUs, use the below scaling config instead.
-# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
-scaling_config = ScalingConfig(num_workers=num_workers)
+scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu)
run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1))

trainer = TorchTrainer(