tensorflow: Merge pull request #58788 from trevor-m:tmorris-bf16-spacedepth

Commit: 95eb6e503c6f30d593fc2803f19c62e1a3e60d23
TensorFlower Gardener authored and sourcegraph-bot committed Dec 27, 2022
1 parent eed08bc commit ac79dc4
Showing 11 changed files with 54 additions and 27 deletions.
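
In summary, this commit registers bfloat16 GPU kernels for the BatchToSpace, SpaceToBatch, DepthToSpace, and SpaceToDepth ops and extends their Python tests to cover the new dtype. A minimal usage sketch of what the change enables (not part of the commit; assumes a CUDA or ROCm TensorFlow build containing this change and a visible GPU):

import tensorflow as tf

# A bfloat16 NHWC tensor: 1 batch, 2x2 spatial, 4 channels.
x = tf.cast(tf.reshape(tf.range(16.0), [1, 2, 2, 4]), tf.bfloat16)

with tf.device("/GPU:0"):
    # Before this commit, no bfloat16 GPU kernel was registered for this op.
    y = tf.nn.depth_to_space(x, block_size=2)

print(y.shape, y.dtype)  # (1, 4, 4, 1) <dtype: 'bfloat16'>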
1 change: 1 addition & 0 deletions tensorflow/tensorflow/core/kernels/batchtospace_op.cc
@@ -284,6 +284,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER);
                           BatchToSpaceOp<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
+TF_CALL_bfloat16(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
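
The new TF_CALL_bfloat16(REGISTER) line expands the existing REGISTER macro once more, for Eigen::bfloat16, alongside the types already covered by TF_CALL_GPU_NUMBER_TYPES. A hedged sketch of the effect at the Python level (assumes a GPU build with this change; tf.batch_to_space lowers to the ND variant that, to our understanding, is registered by the same macro in this file):

import tensorflow as tf

x = tf.cast(tf.reshape(tf.range(4.0), [4, 1, 1, 1]), tf.bfloat16)
with tf.device("/GPU:0"):
    # BatchToSpace: fold the batch of 4 back into a 2x2 spatial grid.
    y = tf.batch_to_space(x, block_shape=[2, 2], crops=[[0, 0], [0, 0]])
print(y.shape)  # (1, 2, 2, 1)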
4 changes: 4 additions & 0 deletions tensorflow/tensorflow/core/kernels/depthtospace_op.cc
@@ -192,6 +192,10 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
     DepthToSpaceOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("DepthToSpace")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::bfloat16>("T"),
+                        DepthToSpaceOp<GPUDevice, Eigen::bfloat16>);
 REGISTER_KERNEL_BUILDER(
     Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
     DepthToSpaceOp<GPUDevice, qint8>);
6 changes: 6 additions & 0 deletions tensorflow/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
@@ -248,6 +248,12 @@ template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::half,
 template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::half,
                                                FORMAT_NHWC>;
 
+// Instantiate the GPU implementations for Eigen::bfloat16.
+template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::bfloat16,
+                                               FORMAT_NCHW>;
+template struct functor::DepthToSpaceOpFunctor<GPUDevice, Eigen::bfloat16,
+                                               FORMAT_NHWC>;
+
 // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
 template struct functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW>;
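
Both FORMAT_NCHW and FORMAT_NHWC functors are instantiated because the GPU op accepts either data layout. An illustrative sketch of the NCHW path (assumption: GPU build with this change; NCHW is only supported on GPU for this op):

import tensorflow as tf

# NCHW layout: 1 batch, 4 channels, 2x2 spatial.
x = tf.cast(tf.reshape(tf.range(16.0), [1, 4, 2, 2]), tf.bfloat16)
with tf.device("/GPU:0"):
    y = tf.nn.depth_to_space(x, block_size=2, data_format="NCHW")
print(y.shape)  # (1, 1, 4, 4)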
@@ -158,7 +158,8 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
 
-TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_FOR_T)
+TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_FOR_T);
+TF_CALL_bfloat16(INSTANTIATE_FOR_T);
 
 #undef INSTANTIATE_FOR_T
 #undef INSTANTIATE
1 change: 1 addition & 0 deletions tensorflow/tensorflow/core/kernels/spacetobatch_op.cc
@@ -296,6 +296,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER);
                           SpaceToBatchOp<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
+TF_CALL_bfloat16(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
4 changes: 4 additions & 0 deletions tensorflow/tensorflow/core/kernels/spacetodepth_op.cc
@@ -209,6 +209,10 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
     SpaceToDepthOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("SpaceToDepth")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::bfloat16>("T"),
+                        SpaceToDepthOp<GPUDevice, Eigen::bfloat16>);
 REGISTER_KERNEL_BUILDER(
     Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
     SpaceToDepthOp<GPUDevice, qint8>);
6 changes: 6 additions & 0 deletions tensorflow/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -245,6 +245,12 @@ template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
                                                FORMAT_NHWC>;
 
+// Instantiate the GPU implementations for Eigen::bfloat16.
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::bfloat16,
+                                               FORMAT_NCHW>;
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::bfloat16,
+                                               FORMAT_NHWC>;
+
 // Instantiate the GPU implementations for uint8.
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, uint8, FORMAT_NCHW>;
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, uint8, FORMAT_NHWC>;
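
SpaceToDepth is the inverse of DepthToSpace for the same block size, which is why the two ops gain bfloat16 support together. A round-trip sketch (illustrative, not from the commit; both ops only rearrange data, so the check is exact):

import tensorflow as tf

x = tf.cast(tf.reshape(tf.range(16.0), [1, 4, 4, 1]), tf.bfloat16)
y = tf.nn.space_to_depth(x, block_size=2)       # shape (1, 2, 2, 4)
x_back = tf.nn.depth_to_space(y, block_size=2)  # shape (1, 4, 4, 1)
assert bool(tf.reduce_all(x_back == x))         # lossless rearrangement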
@@ -18,6 +18,7 @@
 op is tested in tandem with its reverse SpaceToBatch op.
 """
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -44,12 +45,14 @@ def batch_to_space(*args, **kwargs):
     return gen_array_ops.batch_to_space(*args, **kwargs)
 
 
-class BatchToSpaceDepthToSpace(test.TestCase, PythonOpImpl):
+class BatchToSpaceDepthToSpace(test.TestCase, parameterized.TestCase,
+                               PythonOpImpl):
 
   # Verifies that: batch_to_space(x) = transpose(depth_to_space(transpose(x)))
+  @parameterized.parameters(np.float32, dtypes.bfloat16.as_numpy_dtype)
   @test_util.run_deprecated_v1
-  def testDepthToSpaceTranspose(self):
-    x = np.arange(20 * 5 * 8 * 7, dtype=np.float32).reshape([20, 5, 8, 7])
+  def testDepthToSpaceTranspose(self, dtype):
+    x = np.arange(20 * 5 * 8 * 7, dtype=dtype).reshape([20, 5, 8, 7])
     block_size = 2
     for crops_dtype in [dtypes.int64, dtypes.int32]:
       crops = array_ops.zeros((2, 2), dtype=crops_dtype)
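
The identity this test exercises, restated as a standalone sketch with the public TF2 API rather than the test's own wrappers (hedged: the transpose permutation shown is an assumption consistent with the shapes in the test):

import numpy as np
import tensorflow as tf

x = np.arange(20 * 5 * 8 * 7, dtype=np.float32).reshape([20, 5, 8, 7])
block_size = 2
crops = np.zeros((2, 2), dtype=np.int32)

# batch_to_space(x) == transpose(depth_to_space(transpose(x))) for zero crops.
b2s = tf.batch_to_space(x, [block_size, block_size], crops)
d2s = tf.transpose(
    tf.nn.depth_to_space(tf.transpose(x, [3, 1, 2, 0]), block_size),
    [3, 1, 2, 0])
np.testing.assert_array_equal(b2s.numpy(), d2s.numpy())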
@@ -15,6 +15,7 @@
 
 """Functional tests for DepthToSpace op."""
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.client import device_lib
@@ -31,7 +32,7 @@
 from tensorflow.python.platform import tf_logging
 
 
-class DepthToSpaceTest(test.TestCase):
+class DepthToSpaceTest(test.TestCase, parameterized.TestCase):
 
   def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     input_nhwc = math_ops.cast(inputs, dtype)
@@ -63,19 +64,13 @@ def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     output_nhwc = test_util.NCHWToNHWC(output_nchw)
     self.assertAllEqual(output_nhwc, outputs)
 
+  @parameterized.parameters(dtypes.float32, dtypes.float16, dtypes.bfloat16)
   @test_util.run_deprecated_v1
-  def testBasic(self):
+  def testBasic(self, dtype):
     x_np = [[[[1, 2, 3, 4]]]]
     block_size = 2
     x_out = [[[[1], [2]], [[3], [4]]]]
-    self._testOne(x_np, block_size, x_out)
-
-  @test_util.run_deprecated_v1
-  def testBasicFloat16(self):
-    x_np = [[[[1, 2, 3, 4]]]]
-    block_size = 2
-    x_out = [[[[1], [2]], [[3], [4]]]]
-    self._testOne(x_np, block_size, x_out, dtype=dtypes.float16)
+    self._testOne(x_np, block_size, x_out, dtype)
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
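
This hunk replaces a duplicated float16 test with absl's parameterized pattern, which generates one test case per dtype. A minimal sketch of the same pattern (hypothetical standalone test, not this file's exact code):

from absl.testing import parameterized
import tensorflow as tf


class DepthToSpaceSketchTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(tf.float32, tf.float16, tf.bfloat16)
  def testBasic(self, dtype):
    x = tf.cast([[[[1, 2, 3, 4]]]], dtype)
    y = tf.nn.depth_to_space(x, block_size=2)
    self.assertAllEqual(tf.cast(y, tf.float32), [[[[1.], [2.]], [[3.], [4.]]]])


if __name__ == "__main__":
  tf.test.main()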
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Functional tests for SpaceToBatch and BatchToSpace ops."""
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -92,38 +93,41 @@ def batch_to_space(*args, **kwargs):
     return gen_array_ops.batch_to_space(*args, **kwargs)
 
 
-class SpaceToBatchTest(test.TestCase, PythonOpImpl):
+class SpaceToBatchTest(test.TestCase, parameterized.TestCase, PythonOpImpl):
   """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.
 
   This uses the Python compatibility wrapper that forwards to space_to_batch_nd.
   """
 
-  def _testPad(self, inputs, paddings, block_size, outputs):
+  def _testPad(self,
+               inputs,
+               paddings,
+               block_size,
+               outputs,
+               dtype=dtypes.float32):
     with self.cached_session():
       # outputs = space_to_batch(inputs)
       x_tf = self.space_to_batch(
-          math_ops.cast(inputs, dtypes.float32),
-          paddings,
-          block_size=block_size)
+          math_ops.cast(inputs, dtype), paddings, block_size=block_size)
       self.assertAllEqual(x_tf, outputs)
       # inputs = batch_to_space(outputs)
       x_tf = self.batch_to_space(
-          math_ops.cast(outputs, dtypes.float32),
-          paddings,
-          block_size=block_size)
+          math_ops.cast(outputs, dtype), paddings, block_size=block_size)
       self.assertAllEqual(x_tf, inputs)
 
-  def _testOne(self, inputs, block_size, outputs):
+  def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     paddings = np.zeros((2, 2), dtype=np.int32)
-    self._testPad(inputs, paddings, block_size, outputs)
+    self._testPad(inputs, paddings, block_size, outputs, dtype)
 
   # [1, 2, 2, 1] <-> [4, 1, 1, 1]
+  @parameterized.parameters(dtypes.float32, dtypes.float16, dtypes.bfloat16,
+                            dtypes.uint8)
   @test_util.run_deprecated_v1
-  def testSmallInput2x2(self):
+  def testSmallInput2x2(self, dtype):
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 2
     x_out = [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-    self._testOne(x_np, block_size, x_out)
+    self._testOne(x_np, block_size, x_out, dtype)
 
   # [1, 2, 2, 1] <-> [1, 3, 3, 1] (padding) <-> [9, 1, 1, 1]
   @test_util.run_deprecated_v1
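
_testPad checks both directions: space_to_batch(inputs) must equal outputs, and batch_to_space(outputs) must recover inputs. A hedged standalone sketch of that round trip in bfloat16 (public TF2 API; the test itself goes through a compatibility wrapper):

import numpy as np
import tensorflow as tf

x = tf.constant([[[[1], [2]], [[3], [4]]]], dtype=tf.bfloat16)  # (1, 2, 2, 1)
paddings = np.zeros((2, 2), dtype=np.int32)

y = tf.space_to_batch(x, block_shape=[2, 2], paddings=paddings)   # (4, 1, 1, 1)
x_back = tf.batch_to_space(y, block_shape=[2, 2], crops=paddings)  # (1, 2, 2, 1)
assert bool(tf.reduce_all(x_back == x))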
@@ -50,7 +50,9 @@ def testBasic(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 2
     x_out = [[[[1, 2, 3, 4]]]]
-    for dtype in [dtypes.float32, dtypes.float16, dtypes.uint8]:
+    for dtype in [
+        dtypes.float32, dtypes.float16, dtypes.bfloat16, dtypes.uint8
+    ]:
       self._testOne(x_np, block_size, x_out, dtype=dtype)
 
