pytorch · fduwjj · Aug 31, 2022 · Aug 9, 2022 · Aug 9, 2022 · Aug 9, 2022
diff --git a/.github/workflows/spmd_gpu_tests.sh b/.github/workflows/spmd_gpu_tests.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -x
+
+# Print test options
+echo "VERBOSE: ${VERBOSE}"
+
+nvidia-smi
+nvcc --version
+which python3
+python3 --version
+which pip3
+pip3 --version
+
+# Install git
+apt-get update
+apt-get install git -y
+
+# Install dependencies
+# Turn off progress bar to save logs
+pip3 install --upgrade pip
+pip3 config set global.progress_bar off
+pip3 install flake8 pytest pytest-cov pytest-shard numpy expecttest
+if [ -f requirements.txt ]; then pip3 install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html; fi
+
+# Install pippy
+python3 smpd/setup.py install
+
+# Run all integration tests
+python3 test/spmd/tensor/test_megatron_example.py
+python3 test/spmd/tensor/test_ddp.py
+python3 test/spmd/tensor/test_tp_sharding_ops.py 
diff --git a/.github/workflows/spmd_tests.yaml b/.github/workflows/spmd_tests.yaml
@@ -29,3 +29,69 @@ jobs:
       - name: Test with pytest
         run: |
           pytest --cov=spmd test/spmd/
+
+  pytest_tests_gpu:
+    runs-on: linux.16xlarge.nvidia.gpu
+    strategy:
+      matrix:
+        num-gpus: ["4"]
+    env:
+      DOCKER_IMAGE: qts8n/cuda-python:devel
+      PIPPY_ROOT: /PiPPy
+      VERBOSE: "0"
+      OMP_NUM_THREADS: "1"
+
+    steps:
+      - name: Clean working directory
+        shell: bash
+        run: |
+          sudo rm -rf /home/ec2-user/actions-runner/_work/PiPPy/PiPPy/* || true
+      - uses: actions/checkout@v2
+      - name: Clean up previous CUDA driver installations
+        shell: bash
+        run: |
+          set -x
+          yum list installed | grep nvidia || true
+          yum list installed | grep cuda || true
+          sudo yum remove -y cuda || true
+          sudo yum remove -y cuda-drivers || true
+          sudo yum remove -y "*nvidia*" || true
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        run: |
+          bash .github/workflows/install_nvidia_utils_linux.sh || true
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Pull Docker image
+        run: |
+          retry () {
+              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Test docker run
+        run: |
+          set -x
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            --gpus all \
+            -e VERBOSE \
+            -e OMP_NUM_THREADS \
+            --tty \
+            --detach \
+            -v "$(pwd):${PIPPY_ROOT}" \
+            -w "${PIPPY_ROOT}" \
+            "${DOCKER_IMAGE}"
+          )
+          # Run GPU tests and return error signal from docker
+          docker exec -t -w "${PIPPY_ROOT}" "${container_name}" bash -c "bash .github/workflows/spmd_gpu_tests.sh; exit $?"
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd):${PIPPY_ROOT}" -w "${PIPPY_ROOT}" "${DOCKER_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/spmd/__init__.py b/spmd/__init__.py
@@ -61,6 +61,10 @@ def distribute_tensor(
             scatter_shape = list(tensor.size())
             scatter_shape[shard_dim] = chunk_size
             local_tensor = device_mesh.scatter(tensor_list, mesh_dim=idx)
+            # scatter call could not return a tensor with correct requires_grad
+            # field, as ProcessGroupNCCL refuse to take a tensor with requires_grad
+            # to do inplace update! So we manually set it here
+            local_tensor.requires_grad_(tensor.requires_grad)
             dist_tensor = DTensor(
                 local_tensor,
                 device_mesh,

diff --git a/spmd/tensor/api.py b/spmd/tensor/api.py
@@ -66,7 +66,9 @@ def backward(ctx, grad_output: torch.Tensor):  # type: ignore
 
 class FromTorchTensor(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, input: torch.Tensor, device_mesh, placements, run_check):  # type: ignore
+    def forward(
+        ctx, input: torch.Tensor, device_mesh, placements, run_check
+    ):  # type: ignore
         ctx.previous_placement = placements
         ctx.previous_device_mesh = device_mesh
 

diff --git a/spmd/tensor/device_mesh.py b/spmd/tensor/device_mesh.py
@@ -167,8 +167,7 @@ def __init__(
                     # pg or not, it's required that all ranks participate
                     # in subgroup construction
                     new_subgroup = new_group(
-                        ranks=subgroup_ranks,
-                        backend=backend_name,
+                        ranks=subgroup_ranks, backend=backend_name
                     )
                     # only add to dim_groups if the current rank in the subgroup
                     if self.get_rank() in subgroup_ranks:
@@ -240,9 +239,7 @@ def scatter(
         src_for_dim = 0
         if dim_group is not GroupMember.WORLD:
             src_for_dim = _get_global_rank(dim_group, 0)
-        tensor = torch.empty_like(
-            to_scatter[0], requires_grad=to_scatter[0].requires_grad
-        )
+        tensor = torch.empty_like(to_scatter[0])
         if src_for_dim == get_rank():
             scatter(
                 tensor,

diff --git a/spmd/tensor/ops/dropout.py b/spmd/tensor/ops/dropout.py
@@ -16,7 +16,7 @@ def _dist_dropout(
         raise RuntimeError("Not supported!")
     else:
         local_tensor, mask = torch.ops.aten.native_dropout(
-            self.to_local(), p=p, train=train
+            self._local_tensor, p=p, train=train
         )
         return (
             DTensor(

diff --git a/spmd/tensor/ops/matrix_ops.py b/spmd/tensor/ops/matrix_ops.py
@@ -2,13 +2,9 @@
 # implement matrix related ops for distributed tensor
 from typing import Optional
 from spmd.tensor.dispatch import OpSchema
-from spmd.tensor.placement_types import (
-    PlacementSpec,
-)
+from spmd.tensor.placement_types import PlacementSpec
 from spmd.tensor.ops.prop_rules import einop_prop, mm_prop, pointwise_prop
-from spmd.tensor.ops.utils import (
-    register_prop_rule,
-)
+from spmd.tensor.ops.utils import register_prop_rule
 
 
 @register_prop_rule("aten.mm.default")

diff --git a/spmd/tensor/ops/pointwise_ops.py b/spmd/tensor/ops/pointwise_ops.py
@@ -1,8 +1,6 @@
 from typing import Optional
 from spmd.tensor.dispatch import OpSchema
-from spmd.tensor.placement_types import (
-    PlacementSpec,
-)
+from spmd.tensor.placement_types import PlacementSpec
 from spmd.tensor.ops.prop_rules import pointwise_prop
 
 # leave the pointwise_ops list here for convenience,

diff --git a/spmd/tensor/ops/tp_sharding_ops.py b/spmd/tensor/ops/tp_sharding_ops.py
@@ -7,9 +7,7 @@
 from typing import List
 from spmd.tensor.api import DTensor
 from spmd.tensor.placement_types import Shard
-from spmd.tensor.utils import (
-    unwrap_local_tensor,
-)
+from spmd.tensor.utils import unwrap_local_tensor
 from spmd.tensor.ops.utils import unwrap_single_placement, register_impl
 
 """

diff --git a/test/spmd/test_utils.py → spmd/test/_utils.py b/test/spmd/test_utils.py → spmd/test/_utils.py
diff --git a/test/spmd/tensor/test_ddp.py b/test/spmd/tensor/test_ddp.py
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd import (
     distribute_tensor,
     distribute_module,

diff --git a/test/spmd/tensor/test_device_mesh.py b/test/spmd/tensor/test_device_mesh.py
@@ -8,7 +8,7 @@
     _get_global_rank,
 )
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd.tensor import DeviceMesh, DTensor, Shard, Replicate
 
 

diff --git a/test/spmd/tensor/test_math_ops.py b/test/spmd/tensor/test_math_ops.py
@@ -4,7 +4,7 @@
 
 from spmd.tensor.ops.prop_rules import einop_prop
 from spmd.tensor.placement_types import PlacementSpec
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd import distribute_tensor, DeviceMesh, Shard, Replicate
 
 

diff --git a/test/spmd/tensor/test_matrix_ops.py b/test/spmd/tensor/test_matrix_ops.py
@@ -1,7 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import torch
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd import distribute_tensor, DeviceMesh, Shard, Replicate
 
 

diff --git a/test/spmd/tensor/test_megatron_example.py b/test/spmd/tensor/test_megatron_example.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+import torch.distributed as dist
+import functools
+from torch.testing._internal.common_utils import run_tests
+from spmd.test._utils import DistTensorTestBase, with_comms
+from spmd import distribute_tensor, DeviceMesh, DTensor, Shard, Replicate
+
+
+class SimpleModel(torch.nn.Module):
+    def __init__(self):
+        super(SimpleModel, self).__init__()
+        self.net1 = torch.nn.Linear(10, 16)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(16, 12)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+
+def _aggregate_local_tensor(module: torch.nn.Module) -> torch.nn.Module:
+    def hook_func(_module, _input, output):
+        if isinstance(output, DTensor):
+            replica_placement = [Replicate()]
+            return (
+                output.redistribute(output.device_mesh, replica_placement)
+                .contiguous()
+                .local_tensor()
+            )
+
+    module.register_forward_hook(hook_func)
+    return module
+
+
+def _replicate_input_tensor(
+    module: torch.nn.Module, device_mesh, replica_placement
+) -> torch.nn.Module:
+    def hook_func(_, input):
+        if not isinstance(input[0], DTensor):
+            return DTensor.from_local(input[0], device_mesh, replica_placement)
+
+    module.register_forward_pre_hook(hook_func)
+    return module
+
+
+def _gradient_hook(param, grad):
+    param._local_tensor.grad = grad._local_tensor
+
+
+def shard_module(m, device_type):
+    pg = dist.distributed_c10d._get_default_group()
+    start_idx = 0
+    device_mesh = DeviceMesh(
+        device_type,
+        list(range(start_idx, start_idx + pg.size())),
+        dim_groups=[pg],
+    )
+    col_wise_sharding = [Shard(0)]
+    row_wise_sharding = [Shard(1)]
+    replicate = [Replicate()]
+    m.net1.weight = torch.nn.Parameter(
+        distribute_tensor(m.net1.weight, device_mesh, col_wise_sharding)
+    )
+    m.net2.weight = torch.nn.Parameter(
+        distribute_tensor(m.net2.weight, device_mesh, row_wise_sharding)
+    )
+    m.net1.bias = torch.nn.Parameter(
+        distribute_tensor(m.net1.bias, device_mesh, col_wise_sharding)
+    )
+    m.net2.bias = torch.nn.Parameter(
+        distribute_tensor(m.net2.bias, device_mesh, replicate)
+    )
+    m = _replicate_input_tensor(m, device_mesh, replicate)
+    m.net2 = _aggregate_local_tensor(m.net2)
+    m.net1.weight.register_hook(
+        functools.partial(_gradient_hook, m.net1.weight)
+    )
+
+
+class DistTensorMegatronTest(DistTensorTestBase):
+    @with_comms
+    def test_simple_megatron_e2e(self):
+        LR = 0.5
+        inp_size = [5, 10]
+        torch.manual_seed(0)
+        inp = torch.rand(*inp_size, device=self.device_type)
+        torch.manual_seed(5)
+        model = SimpleModel()
+        torch.manual_seed(5)
+        model_tp = SimpleModel()
+        shard_module(model_tp, self.device_type)
+
+        output = model(inp)
+        output_tp = model_tp(inp)
+        self.assertEqual(output, output_tp)
+
+        output.sum().backward()
+        output_tp.sum().backward()
+        # self.assertTrue(model_tp.net1.weight.local_tensor().grad is not None)
+
+        optim = torch.optim.SGD(model.parameters(), lr=LR)
+        optim.step()
+        optim = torch.optim.SGD(model_tp.parameters(), lr=LR)
+        optim.step()
+
+        torch.manual_seed(3)
+        inp = torch.rand(*inp_size).cuda(self.rank)
+        output = model(inp)
+        output_tp = model_tp(inp)
+        self.assertEqual(output, output_tp)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/spmd/tensor/test_pointwise_ops.py b/test/spmd/tensor/test_pointwise_ops.py
@@ -1,7 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import torch
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms, TEST_GPU_NUM
+from spmd.test._utils import DistTensorTestBase, with_comms, TEST_GPU_NUM
 from spmd import DeviceMesh, DTensor, Shard, Replicate, _Partial
 from torch.distributed.distributed_c10d import ReduceOp
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -13,8 +13,10 @@ def _run_sharded_elementwise_ops(
         self, mesh, spec, input_size, op, reset_seed=None, **kwargs
     ):
         torch.manual_seed(self.rank)
-        input_tensor = torch.randn(*input_size, requires_grad=True)
-        dist_tensor = DTensor.from_local(input_tensor, mesh, spec)
+        input_tensor = torch.randn(
+            *input_size, device=self.device_type, requires_grad=True
+        )
+        dist_tensor = DTensor(input_tensor, mesh, spec)
         reset_seed() if reset_seed else None
         dt = op(dist_tensor, **kwargs)
         reset_seed() if reset_seed else None

diff --git a/test/spmd/tensor/test_redistribute.py b/test/spmd/tensor/test_redistribute.py
@@ -5,7 +5,7 @@
 
 from torch.testing._internal.common_utils import run_tests
 
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd.tensor import DeviceMesh, DTensor, Replicate, Shard, _Partial
 
 

diff --git a/test/spmd/tensor/test_tensor.py b/test/spmd/tensor/test_tensor.py
@@ -4,7 +4,7 @@
 from torch.distributed.distributed_c10d import ReduceOp
 
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd.tensor import DeviceMesh, DTensor, Replicate, Shard, _Partial
 
 

diff --git a/test/spmd/tensor/test_tensor_ops.py b/test/spmd/tensor/test_tensor_ops.py
@@ -1,7 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import torch
 from torch.testing._internal.common_utils import run_tests
-from ..test_utils import DistTensorTestBase, with_comms
+from spmd.test._utils import DistTensorTestBase, with_comms
 from spmd import distribute_tensor, DeviceMesh, DTensor, Shard, Replicate