From be9e063fe3edbc5b6129ca2fe7bc4bd57f012127 Mon Sep 17 00:00:00 2001
From: matthewdeng
Date: Thu, 13 Apr 2023 14:14:03 -0700
Subject: [PATCH] [train] Fix rendering of diff code-blocks (#34355)

Signed-off-by: Matthew Deng
Signed-off-by: Jack He
---
 doc/source/train/dl_guide.rst | 115 +++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 57 deletions(-)

diff --git a/doc/source/train/dl_guide.rst b/doc/source/train/dl_guide.rst
index 39e4a4c93315..d9af4f66ae79 100644
--- a/doc/source/train/dl_guide.rst
+++ b/doc/source/train/dl_guide.rst
@@ -66,28 +66,29 @@ training.

     .. code-block:: diff

-        import torch
-        from torch.nn.parallel import DistributedDataParallel
+         import torch
+         from torch.nn.parallel import DistributedDataParallel
         +from ray.air import session
         +from ray import train
         +import ray.train.torch

-        def train_func():
-        -   device = torch.device(f"cuda:{session.get_local_rank()}" if
-        -       torch.cuda.is_available() else "cpu")
-        -   torch.cuda.set_device(device)
+         def train_func():
+        -    device = torch.device(f"cuda:{session.get_local_rank()}" if
+        -        torch.cuda.is_available() else "cpu")
+        -    torch.cuda.set_device(device)

-            # Create model.
-            model = NeuralNetwork()
+             # Create model.
+             model = NeuralNetwork()

-        -   model = model.to(device)
-        -   model = DistributedDataParallel(model,
-        -       device_ids=[session.get_local_rank()] if torch.cuda.is_available() else None)
+        -    model = model.to(device)
+        -    model = DistributedDataParallel(model,
+        -        device_ids=[session.get_local_rank()] if torch.cuda.is_available() else None)

-        +   model = train.torch.prepare_model(model)
+        +    model = train.torch.prepare_model(model)

-            ...
+             ...
+

 Then, use the ``prepare_data_loader`` function to automatically add a ``DistributedSampler`` to your ``DataLoader``
@@ -96,35 +97,35 @@ training.

     .. code-block:: diff

-        import torch
-        from torch.utils.data import DataLoader, DistributedSampler
+         import torch
+         from torch.utils.data import DataLoader, DistributedSampler
         +from ray.air import session
         +from ray import train
         +import ray.train.torch

-        def train_func():
-        -   device = torch.device(f"cuda:{session.get_local_rank()}" if
-        -       torch.cuda.is_available() else "cpu")
-        -   torch.cuda.set_device(device)
+         def train_func():
+        -    device = torch.device(f"cuda:{session.get_local_rank()}" if
+        -        torch.cuda.is_available() else "cpu")
+        -    torch.cuda.set_device(device)

-            ...
+             ...

-        -   data_loader = DataLoader(my_dataset, batch_size=worker_batch_size, sampler=DistributedSampler(dataset))
+        -    data_loader = DataLoader(my_dataset, batch_size=worker_batch_size, sampler=DistributedSampler(dataset))

-        +   data_loader = DataLoader(my_dataset, batch_size=worker_batch_size)
-        +   data_loader = train.torch.prepare_data_loader(data_loader)
+        +    data_loader = DataLoader(my_dataset, batch_size=worker_batch_size)
+        +    data_loader = train.torch.prepare_data_loader(data_loader)

-            for X, y in data_loader:
-        -       X = X.to_device(device)
-        -       y = y.to_device(device)
+             for X, y in data_loader:
+        -        X = X.to_device(device)
+        -        y = y.to_device(device)

     .. tip::
-       Keep in mind that ``DataLoader`` takes in a ``batch_size`` which is the batch size for each worker.
-       The global batch size can be calculated from the worker batch size (and vice-versa) with the following equation:
-
-       .. code-block::
+        Keep in mind that ``DataLoader`` takes in a ``batch_size`` which is the batch size for each worker.
+        The global batch size can be calculated from the worker batch size (and vice-versa) with the following equation:

+        .. code-block:: python

+            global_batch_size = worker_batch_size * session.get_world_size()

 .. tabbed:: TensorFlow
@@ -300,11 +301,11 @@ Then, you can pass in the config dictionary as an argument to ``Trainer``:

 .. code-block:: diff

     +config = {} # This should be populated.
-    trainer = TorchTrainer(
-        train_func,
-    +   train_loop_config=config,
-        scaling_config=ScalingConfig(num_workers=2)
-    )
+     trainer = TorchTrainer(
+         train_func,
+    +    train_loop_config=config,
+         scaling_config=ScalingConfig(num_workers=2)
+     )

 Putting this all together, you can run your training function with different configurations. As an example:
@@ -1083,29 +1084,29 @@ precision datatype for operations like linear layers and convolutions.

 .. code-block:: diff

-    def train_func():
-    +   train.torch.accelerate(amp=True)
+     def train_func():
+    +    train.torch.accelerate(amp=True)

-        model = NeuralNetwork()
-        model = train.torch.prepare_model(model)
+         model = NeuralNetwork()
+         model = train.torch.prepare_model(model)

-        data_loader = DataLoader(my_dataset, batch_size=worker_batch_size)
-        data_loader = train.torch.prepare_data_loader(data_loader)
+         data_loader = DataLoader(my_dataset, batch_size=worker_batch_size)
+         data_loader = train.torch.prepare_data_loader(data_loader)

-        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
-    +   optimizer = train.torch.prepare_optimizer(optimizer)
+         optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    +    optimizer = train.torch.prepare_optimizer(optimizer)

-        model.train()
-        for epoch in range(90):
-            for images, targets in dataloader:
-                optimizer.zero_grad()
+         model.train()
+         for epoch in range(90):
+             for images, targets in dataloader:
+                 optimizer.zero_grad()

-                outputs = model(images)
-                loss = torch.nn.functional.cross_entropy(outputs, targets)
+                 outputs = model(images)
+                 loss = torch.nn.functional.cross_entropy(outputs, targets)

-    -           loss.backward()
-    +           train.torch.backward(loss)
-                optimizer.step()
+    -            loss.backward()
+    +            train.torch.backward(loss)
+                 optimizer.step()
      ...

@@ -1126,13 +1127,13 @@ Reproducibility

 .. code-block:: diff

-    def train_func():
-    +   train.torch.enable_reproducibility()
+     def train_func():
+    +    train.torch.enable_reproducibility()

-        model = NeuralNetwork()
-        model = train.torch.prepare_model(model)
+         model = NeuralNetwork()
+         model = train.torch.prepare_model(model)

-        ...
+         ...

 .. warning::
     :func:`ray.train.torch.enable_reproducibility` can't guarantee completely reproducible results across executions. To learn more, read
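
For reference, the sketch below (not part of the patch) shows what a training function can look like once all of the migrations illustrated in the patched guide are applied together: ``prepare_model``, ``prepare_data_loader``, mixed precision via ``accelerate(amp=True)``, ``prepare_optimizer``, ``train.torch.backward``, and ``enable_reproducibility``. It assumes the Ray 2.x AIR-era API used in the guide (``ray.air.session``, ``ray.train.torch``). The two-layer model, the random ``TensorDataset``, and the ``config`` keys are hypothetical stand-ins for the guide's ``NeuralNetwork``, ``my_dataset``, and ``train_loop_config`` placeholders.

.. code-block:: python

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    from ray import train
    from ray.air import session
    import ray.train.torch  # makes the train.torch utilities available


    def train_func(config):
        # Enable automatic mixed precision before preparing model and optimizer.
        train.torch.accelerate(amp=True)
        # Optionally make runs as deterministic as the backend allows.
        train.torch.enable_reproducibility()

        # Hypothetical stand-ins for the guide's ``NeuralNetwork`` and ``my_dataset``.
        model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
        my_dataset = TensorDataset(torch.randn(256, 8), torch.randint(0, 2, (256,)))

        # Wraps the model in DDP and moves it to the right device automatically.
        model = train.torch.prepare_model(model)

        # ``batch_size`` here is the per-worker batch size; a DistributedSampler
        # is added automatically by prepare_data_loader.
        worker_batch_size = config["batch_size"]
        data_loader = DataLoader(my_dataset, batch_size=worker_batch_size, shuffle=True)
        data_loader = train.torch.prepare_data_loader(data_loader)

        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
        optimizer = train.torch.prepare_optimizer(optimizer)

        model.train()
        for epoch in range(config["epochs"]):
            for X, y in data_loader:
                optimizer.zero_grad()
                loss = torch.nn.functional.cross_entropy(model(X), y)
                train.torch.backward(loss)  # replaces loss.backward() under AMP
                optimizer.step()
            session.report({"epoch": epoch, "loss": loss.item()})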
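
Launching such a function follows the guide's ``TorchTrainer`` example: the dictionary passed as ``train_loop_config`` is forwarded to ``train_func``, and ``ScalingConfig(num_workers=2)`` matches the patch. The hyperparameter values below are arbitrary placeholders; note the tip's relation ``global_batch_size = worker_batch_size * session.get_world_size()``.

.. code-block:: python

    from ray.air.config import ScalingConfig
    from ray.train.torch import TorchTrainer

    # Each worker consumes ``batch_size`` samples per step, so with two workers
    # the effective global batch size is 2 * 32 = 64.
    trainer = TorchTrainer(
        train_func,
        train_loop_config={"batch_size": 32, "lr": 0.001, "epochs": 3},
        scaling_config=ScalingConfig(num_workers=2),
    )
    result = trainer.fit()
    print(result.metrics)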