
[SPMD] Enable GPU CI for Distributed Tensor #333

Merged Aug 31, 2022

Commits (72)
7e41a7c
Enable GPU test for Distributed Tensor
fduwjj Aug 9, 2022
a70c626
change test specs
fduwjj Aug 9, 2022
a49c086
Fix errors
fduwjj Aug 9, 2022
0c37fbe
Shard test
fduwjj Aug 9, 2022
715771b
Install pytest-shard
fduwjj Aug 9, 2022
03e3f8a
Remove relative import and add new e2e test
fduwjj Aug 9, 2022
8175fd6
Merge with main
fduwjj Aug 9, 2022
3b05a48
Reformat
fduwjj Aug 9, 2022
77b8e99
Patch fix and retest
fduwjj Aug 9, 2022
a42dce2
Fix formart
fduwjj Aug 10, 2022
3b85bdd
Fix CI test
fduwjj Aug 10, 2022
f485d14
Fix CI Python version
fduwjj Aug 10, 2022
d0c54ca
Docker image update
fduwjj Aug 10, 2022
5819ab0
Revert docker change and change code instead
fduwjj Aug 10, 2022
01eb5cb
test
fduwjj Aug 10, 2022
600b108
Merge branch 'main' into enable_gpu_test
fduwjj Aug 10, 2022
2d4e779
fix test
fduwjj Aug 10, 2022
d616ccd
CI test
fduwjj Aug 10, 2022
49e31b8
remove pippy install
fduwjj Aug 10, 2022
6316f8d
Fix linter
fduwjj Aug 10, 2022
1641b03
merge with main
fduwjj Aug 10, 2022
278db1f
format
fduwjj Aug 10, 2022
efd640c
remove unnecessary change
fduwjj Aug 10, 2022
acd047e
revert test chage
fduwjj Aug 10, 2022
302be98
Split commit
fduwjj Aug 10, 2022
560e23f
continue revert
fduwjj Aug 10, 2022
7521575
revert all test related change
fduwjj Aug 10, 2022
5e0b372
Merge branch 'main' into enable_gpu_test
fduwjj Aug 11, 2022
b830340
Merge with main
fduwjj Aug 11, 2022
4be00a2
Merge branch 'main' into enable_gpu_test
fduwjj Aug 14, 2022
4b5d3d5
Format
fduwjj Aug 14, 2022
d2c5ed0
Add back pytest
fduwjj Aug 14, 2022
4022b09
fix
fduwjj Aug 14, 2022
4271245
Merge with main
fduwjj Aug 17, 2022
a75d361
Change name
fduwjj Aug 17, 2022
d6ca983
Merge branch 'main' into enable_gpu_test
fduwjj Aug 20, 2022
d82dd2a
Reformat
fduwjj Aug 22, 2022
e1d95a5
Merge branch 'main' into enable_gpu_test
fduwjj Aug 22, 2022
aeb6630
update nvidia driver version
fduwjj Aug 22, 2022
09fe4e3
Change driver version
fduwjj Aug 22, 2022
2e62a9f
Change cuda version
fduwjj Aug 22, 2022
448f35c
Update docker image and pytorch version
fduwjj Aug 23, 2022
b19c472
Update docker
fduwjj Aug 23, 2022
5d398fe
Merge branch 'main' into enable_gpu_test
fduwjj Aug 23, 2022
9d1cc29
Narrow down to only one test
fduwjj Aug 24, 2022
5b21cd0
debug
fduwjj Aug 24, 2022
a926292
debug 2
fduwjj Aug 24, 2022
3014d4b
debug 3
fduwjj Aug 24, 2022
cd15563
debug 4
fduwjj Aug 24, 2022
7b7b577
debug 5
fduwjj Aug 24, 2022
b2e1364
debug 5
fduwjj Aug 24, 2022
e3555f8
debug 7
fduwjj Aug 24, 2022
6e58e43
debug 8
fduwjj Aug 24, 2022
4a6868b
add ssh to CI machine
fduwjj Aug 25, 2022
49e4dd3
Fix machine cleaning up part
fduwjj Aug 25, 2022
38087d6
fix CI
fduwjj Aug 25, 2022
0ee4cdf
Fix script
fduwjj Aug 26, 2022
3acacd5
Change permission of file
fduwjj Aug 26, 2022
01415a5
Use new CI machines
fduwjj Aug 26, 2022
a6993b9
Use AWS EC2 p4 machine
fduwjj Aug 26, 2022
a3ff713
Merge with main
fduwjj Aug 30, 2022
6b767ba
Update machine
fduwjj Aug 30, 2022
be3f1e4
Add share memory config
fduwjj Aug 30, 2022
6d802eb
Comment out remove program
fduwjj Aug 30, 2022
02c9c02
Update command
fduwjj Aug 31, 2022
bbc9740
Reformat and skip test_dtensor_op
fduwjj Aug 31, 2022
7c734a2
Make Linter happy
fduwjj Aug 31, 2022
43778cb
Merge branch 'main' into enable_gpu_test
fduwjj Aug 31, 2022
a17d2a3
Comment out failing test for CI
fduwjj Aug 31, 2022
c69a4f0
reformat
fduwjj Aug 31, 2022
6366548
Refresh CI
fduwjj Aug 31, 2022
cc00ae4
Fix linter
fduwjj Aug 31, 2022
46 changes: 46 additions & 0 deletions .github/workflows/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Using cuda 11.3
FROM nvidia/cuda:11.3.1-devel-ubuntu18.04

# nvidia cuda 11.3 paths
ENV LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
ENV LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda-11.3/lib64

# ensure local python is preferred over distribution python
ENV PATH /usr/local/bin:$PATH

ENV LANG C.UTF-8

# Ignore `tzdata` asking questions
ENV DEBIAN_FRONTEND=noninteractive

RUN echo "US/Pacific" > /etc/timezone \
&& ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \
&& apt update && apt upgrade -y \
&& apt-get -y install build-essential checkinstall wget git \
libreadline-gplv2-dev libncursesw5-dev libssl-dev \
libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev

# Set Python Version
ENV PYTHON_VERSION 3.9.12
ENV PYTHON_COMMAND 3.9

# Install Python from source.
RUN cd /opt \
&& wget https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tgz \
&& tar xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-optimizations \
&& make altinstall \
&& ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python \
&& ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python3 \
&& ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip \
&& ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip3 \
&& cd /

# Install python libraries needed for CI test.
RUN pip3 install --upgrade pip \
&& pip3 config set global.progress_bar off \
&& pip3 install flake8 pytest pytest-cov pytest-shard numpy expecttest hypothesis pyyaml

LABEL version="1.0.2"
LABEL description="Docker image for Ubuntu Linux with CUDA 11.3 and Python for CI."
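One detail of the Dockerfile worth calling out: the download URL uses `${PYTHON_VERSION%%[a-z]*}`, which removes the longest suffix starting at the first lowercase letter. Stable versions pass through unchanged, while a pre-release such as `3.11.0a1` resolves to its `3.11.0` release directory on python.org. A quick standalone sketch (version strings here are illustrative):

```shell
# %%[a-z]* strips the longest suffix beginning with a lowercase letter.
PYTHON_VERSION=3.9.12
echo "${PYTHON_VERSION%%[a-z]*}"    # stable version: nothing to strip -> 3.9.12

PYTHON_VERSION=3.11.0a1
echo "${PYTHON_VERSION%%[a-z]*}"    # pre-release suffix "a1" stripped -> 3.11.0
```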
28 changes: 28 additions & 0 deletions .github/workflows/spmd_gpu_tests.sh
@@ -0,0 +1,28 @@
#!/bin/bash

set -x

# Print test options
echo "VERBOSE: ${VERBOSE}"
echo "SHARD: ${SHARD}"

nvidia-smi
nvcc --version
cat /etc/os-release
which python3
python3 --version
which pip3
pip3 --version

# Install dependencies
# Turn off progress bar to save logs
pip3 install --upgrade pip
if [ -f requirements.txt ]; then pip3 install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html; fi

# Install the spmd package
python3 spmd/setup.py install

set -ex

# Run all integration tests
pytest --shard-id=${SHARD} --num-shards=4 --cov=spmd test/spmd/ --ignore=test/spmd/tensor/test_dtensor_ops.py
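The script fans the suite out with pytest-shard: each of the four CI jobs passes its own `--shard-id`, and the shards together cover every test exactly once. A simplified index-modulo sketch of the partitioning idea (the real plugin assigns tests deterministically by hashing their node IDs rather than by position, but the coverage property is the same; the test names below are made up):

```shell
# Each shard keeps the tests whose index falls in its residue class mod 4.
tests="test_a test_b test_c test_d test_e test_f"
num_shards=4

for shard_id in 0 1 2 3; do
  selected=""
  i=0
  for t in $tests; do
    if [ $(( i % num_shards )) -eq "$shard_id" ]; then
      selected="$selected $t"
    fi
    i=$(( i + 1 ))
  done
  echo "shard $shard_id:$selected"
done
```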
77 changes: 77 additions & 0 deletions .github/workflows/spmd_tests.yaml
@@ -39,3 +39,80 @@ jobs:
- name: Test with pytest
run: |
pytest --shard-id=${{ matrix.shard }} --num-shards=4 --cov=spmd test/spmd/

pytest_tests_gpu:
runs-on: linux.g4dn.12xlarge.nvidia.gpu
strategy:
matrix:
shard: ["0", "1", "2", "3"]
env:
DOCKER_IMAGE: gingerhugo/cuda-11.3-python-3.9:v1.0.2
SPMD_ROOT: /PiPPy
VERBOSE: "0"
OMP_NUM_THREADS: "1"
SHARD: ${{ matrix.shard }}

steps:
- name: Clean working directory
shell: bash
run: |
sudo rm -rf /home/ec2-user/actions-runner/_work/PiPPy/PiPPy/* || true
- uses: actions/checkout@v2
- name: Clean up previous CUDA driver installations
shell: bash
run: |
set -x
yum list installed | grep nvidia || true
yum list installed | grep cuda || true
sudo yum remove -y cuda || true
sudo yum remove -y cuda-drivers || true
sudo yum remove -y "*nvidia*" || true
- name: Setup SSH (Click me for login details)
uses: pytorch/pytorch/.github/actions/setup-ssh@master
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
run: |
bash .github/workflows/install_nvidia_utils_linux.sh || true
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Test docker run
run: |
set -x
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
--gpus all \
--shm-size=1g --ulimit memlock=-1 \
-e VERBOSE \
-e OMP_NUM_THREADS \
-e SHARD \
--tty \
--detach \
-v "$(pwd):${SPMD_ROOT}" \
-w "${SPMD_ROOT}" \
"${DOCKER_IMAGE}"
)
# Run GPU tests and return error signal from docker
docker exec -t -w "${SPMD_ROOT}" "${container_name}" bash -c "bash .github/workflows/spmd_gpu_tests.sh; exit \$?"
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd):${SPMD_ROOT}" -w "${SPMD_ROOT}" "${DOCKER_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Hold runner for 2 hours or until ssh sessions have drained
shell: bash
if: inputs.skip-wait-ssh == ''
run: .github/workflows/wait_for_ssh_to_drain.sh
- name: Kill containers, clean up images
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af
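The `retry` helper in the "Pull Docker image" step is a compact pattern: run the command, and on failure retry twice with 1-second and then 2-second backoff. A standalone demo with a stand-in flaky command (the `flaky` function and temp file are illustrative, not part of the workflow):

```shell
# Retry helper as used in the workflow: up to 3 attempts with short backoff.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Stand-in for "docker pull": fails on the first call, succeeds on the second.
# A file tracks attempts because the retries run in subshells.
attempts_file=$(mktemp)
flaky () {
  echo attempt >> "$attempts_file"
  [ "$(wc -l < "$attempts_file")" -ge 2 ]
}

retry flaky && echo "pulled after $(wc -l < "$attempts_file") attempts"
```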
13 changes: 13 additions & 0 deletions .github/workflows/wait_for_ssh_to_drain.sh
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

set -eou pipefail

echo "Holding runner for up to 2 hours or until all ssh sessions have logged out"
for _ in $(seq 1440); do
# Break if no ssh session exists anymore
if [ "$(who)" = "" ]; then
break
fi
echo "."
sleep 5
done
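The loop bounds encode the step's two-hour cap: 1440 polls of `who` at a 5-second interval.

```shell
# 1440 iterations x 5 s sleep = the 2-hour hold advertised in the step name.
iterations=1440
interval_s=5
echo "max hold: $(( iterations * interval_s )) seconds"   # 7200 s = 2 hours
```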
5 changes: 1 addition & 4 deletions spmd/__init__.py
Expand Up @@ -72,10 +72,7 @@ def distribute_tensor(
raise RuntimeError("Not supported!")

return DTensor(
tensor,
device_mesh,
placements,
requires_grad=tensor.requires_grad,
tensor, device_mesh, placements, requires_grad=tensor.requires_grad
)


19 changes: 4 additions & 15 deletions spmd/tensor/dispatch.py
@@ -102,9 +102,7 @@ def _reshape_alias(

_CURRENT_DECOMPOSITION_TABLE: Dict[
Callable[..., object], Callable[..., object]
] = {
torch.ops.aten._reshape_alias.default: _reshape_alias,
}
] = {torch.ops.aten._reshape_alias.default: _reshape_alias}


def operator_dispatch(
@@ -133,10 +131,7 @@ def operator_dispatch(
args_schema = tree_map(unwrap_schema, args)
kwargs_schema = tree_map(unwrap_schema, kwargs)

op_schema = OpSchema(
args_schema,
kwargs_schema,
)
op_schema = OpSchema(args_schema, kwargs_schema)
sharding_prop_func = op_to_rules.get(op_key, None)

# step 1. there's sharding propagation rule, run
@@ -187,10 +182,7 @@ def operator_dispatch(
# run local op computation with potentially modified args/kwargs
local_tensor_args = cast(Tuple[object, ...], local_tensor_args)
local_tensor_kwargs = cast(Dict[str, object], local_tensor_kwargs)
local_results = op_call(
*local_tensor_args,
**local_tensor_kwargs,
)
local_results = op_call(*local_tensor_args, **local_tensor_kwargs)

if schema_kind == SchemaKind.inplace:
# inplace op should return self instead of re-wrapping
@@ -230,8 +222,5 @@ def operator_dispatch(
else:
tensor_args = tree_map(unwrap_local_tensor, args)
tensor_kwargs = tree_map(unwrap_local_tensor, kwargs)
local_results = op_call(
*tensor_args,
**tensor_kwargs,
)
local_results = op_call(*tensor_args, **tensor_kwargs)
return wrap(local_results, op_schema.args_spec[0])
4 changes: 1 addition & 3 deletions spmd/tensor/ops/math_ops.py
@@ -19,9 +19,7 @@ def _gen_spec_with_pending_sum(


def einop_rule(
equation: str,
op_schema: OpSchema,
linearity: bool = False,
equation: str, op_schema: OpSchema, linearity: bool = False
) -> OutputSharding:
"""
Propagate the sharding of inputs to output for ops whose data
4 changes: 1 addition & 3 deletions spmd/tensor/ops/tensor_ops.py
@@ -41,9 +41,7 @@ def no_shard_prop_rule(op_schema: OpSchema) -> OutputSharding:
"aten.new_empty_strided.default",
]

no_shard_prop_ops = [
"aten._local_scalar_dense.default",
]
no_shard_prop_ops = ["aten._local_scalar_dense.default"]

for op in default_prop_ops:
DTensor._op_to_rules[op] = default_prop_rule
4 changes: 1 addition & 3 deletions spmd/tensor/ops/view_ops.py
@@ -237,9 +237,7 @@ def dim_movedim(
assert max(input) < ndim
assert max(destination) < ndim

dest = [
-1,
] * ndim
dest = [-1] * ndim
for i, d in zip(input, destination):
dest[d] = i

4 changes: 1 addition & 3 deletions spmd/tensor/redistribute.py
@@ -216,9 +216,7 @@ def backward(ctx, grad_output: "spmd_tensor.DTensor"):  # type: ignore

return (
redistribute_spmd_tensor(
grad_output,
previous_device_mesh,
target_placements,
grad_output, previous_device_mesh, target_placements
),
None,
None,
16 changes: 8 additions & 8 deletions spmd/test/common_utils.py
@@ -205,17 +205,15 @@ def __next__(self) -> Tuple[Tuple[object, ...], Dict[str, object]]:
else:
new_kwargs.append(arg)

return tree_unflatten(
new_args, self.flatten_args_spec
), tree_unflatten(new_kwargs, self.flatten_kwargs_spec)
return (
tree_unflatten(new_args, self.flatten_args_spec),
tree_unflatten(new_kwargs, self.flatten_kwargs_spec),
)
except StopIteration:
raise StopIteration

def to_dist_tensor(
self,
t: torch.Tensor,
mesh: DeviceMesh,
placements: List[Placement],
self, t: torch.Tensor, mesh: DeviceMesh, placements: List[Placement]
) -> torch.Tensor:
if type(t) is torch.Tensor or type(t) is torch.nn.Parameter:
if self.is_supported_tensor(t):
@@ -231,7 +229,9 @@ def to_dist_tensor(
else:
r = distribute_tensor(t, mesh, placements)
if type(t) is torch.nn.Parameter:
r = torch.nn.Parameter(r, requires_grad=r.requires_grad) # type: ignore
r = torch.nn.Parameter(
r, requires_grad=r.requires_grad
) # type: ignore
return r
else:
self.miss += 1
5 changes: 4 additions & 1 deletion test/spmd/tensor/test_ddp.py
@@ -2,7 +2,10 @@
import torch
import torch.nn as nn
from torch.testing._internal.common_utils import run_tests
from spmd.test.common_utils import DistTensorTestBase, with_comms # type: ignore
from spmd.test.common_utils import ( # type: ignore
DistTensorTestBase,
with_comms,
)
from spmd import (
distribute_tensor,
distribute_module,
5 changes: 4 additions & 1 deletion test/spmd/tensor/test_device_mesh.py
@@ -8,7 +8,10 @@
_get_global_rank,
)
from torch.testing._internal.common_utils import run_tests
from spmd.test.common_utils import DistTensorTestBase, with_comms # type: ignore
from spmd.test.common_utils import ( # type: ignore
DistTensorTestBase,
with_comms,
)
from spmd.tensor import DeviceMesh, DTensor, Shard, Replicate


5 changes: 4 additions & 1 deletion test/spmd/tensor/test_dtensor.py
@@ -4,7 +4,10 @@
from torch.distributed.distributed_c10d import ReduceOp

from torch.testing._internal.common_utils import run_tests
from spmd.test.common_utils import DistTensorTestBase, with_comms # type: ignore
from spmd.test.common_utils import ( # type: ignore
DistTensorTestBase,
with_comms,
)
from spmd.tensor import DeviceMesh, DTensor, Replicate, Shard, _Partial


7 changes: 1 addition & 6 deletions test/spmd/tensor/test_dtensor_ops.py
@@ -671,12 +671,7 @@ def wrapped(fn):
}


def run_dtensor_crossref(
test_case,
func,
args,
kwargs,
):
def run_dtensor_crossref(test_case, func, args, kwargs):
to_dtensor = DTensorConverter(test_case.mesh, args, kwargs)

# TODO: also handle cases where func raise an exception
5 changes: 4 additions & 1 deletion test/spmd/tensor/test_math_ops.py
@@ -5,7 +5,10 @@

from spmd.tensor.ops.math_ops import einop_rule, reduction_rule
from spmd.tensor.placement_types import DTensorSpec, Replicate
from spmd.test.common_utils import DistTensorTestBase, with_comms # type: ignore
from spmd.test.common_utils import ( # type: ignore
DistTensorTestBase,
with_comms,
)
from spmd import distribute_tensor, DeviceMesh, Shard

