From aeafd9e2b5c2dc3a77489e8485b828dec8a6210d Mon Sep 17 00:00:00 2001
From: Thomas <thomasraoux@google.com>
Date: Fri, 2 Apr 2021 09:24:36 -0700
Subject: [PATCH] Fix CUDA HAL bug and enable more execution tests (#5296)

---
 iree/hal/cuda/graph_command_buffer.c       |  8 +++--
 iree/test/e2e/cuda_specific/BUILD          | 36 ++++++++++++++++++++++
 iree/test/e2e/cuda_specific/CMakeLists.txt | 26 ++++++++++++++++
 iree/test/e2e/cuda_specific/dot.mlir       | 29 +++++++++++++++++
 4 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 iree/test/e2e/cuda_specific/BUILD
 create mode 100644 iree/test/e2e/cuda_specific/CMakeLists.txt
 create mode 100644 iree/test/e2e/cuda_specific/dot.mlir

diff --git a/iree/hal/cuda/graph_command_buffer.c b/iree/hal/cuda/graph_command_buffer.c
index 7fa0822c42e3..af3918533dbe 100644
--- a/iree/hal/cuda/graph_command_buffer.c
+++ b/iree/hal/cuda/graph_command_buffer.c
@@ -246,7 +246,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
   CUDA_MEMSET_NODE_PARAMS params = {
       .dst = target_device_buffer + target_offset,
       .elementSize = pattern_length,
-      .width = length,
+      // Width is in elements, not bytes, despite what the driver docs say.
+      .width = length / pattern_length,
       .height = 1,
       .value = dword_pattern,
   };
@@ -327,8 +328,9 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_push_descriptor_set(
     assert(arg_index < max_binding_count &&
            "binding index larger than the max expected.");
     CUdeviceptr device_ptr =
-        iree_hal_cuda_buffer_device_pointer(bindings[i].buffer) +
-        iree_hal_buffer_byte_offset(bindings[i].buffer);
+        iree_hal_cuda_buffer_device_pointer(
+            iree_hal_buffer_allocated_buffer(bindings[i].buffer)) +
+        iree_hal_buffer_byte_offset(bindings[i].buffer) + bindings[i].offset;
     *((CUdeviceptr*)command_buffer->current_descriptor[arg_index]) = device_ptr;
   }
   return iree_ok_status();
diff --git a/iree/test/e2e/cuda_specific/BUILD b/iree/test/e2e/cuda_specific/BUILD
new file mode 100644
index 000000000000..cb55d3d8cb4c
--- /dev/null
+++ b/iree/test/e2e/cuda_specific/BUILD
@@ -0,0 +1,36 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# End-to-end IREE tests specific to the CUDA backend, so that features can be
+# enabled incrementally.
+
+load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_check_single_backend_test_suite(
+    name = "check_cuda",
+    srcs = [
+        "dot.mlir",
+    ],
+    compiler_flags = [
+        "-iree-flow-dispatch-linalg-on-tensors",
+    ],
+    driver = "cuda",
+    target_backend = "cuda",
+)
diff --git a/iree/test/e2e/cuda_specific/CMakeLists.txt b/iree/test/e2e/cuda_specific/CMakeLists.txt
new file mode 100644
index 000000000000..aa07629755d5
--- /dev/null
+++ b/iree/test/e2e/cuda_specific/CMakeLists.txt
@@ -0,0 +1,26 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# iree/test/e2e/cuda_specific/BUILD                                            #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_cuda
+  SRCS
+    "dot.mlir"
+  TARGET_BACKEND
+    "cuda"
+  DRIVER
+    "cuda"
+  COMPILER_FLAGS
+    "-iree-flow-dispatch-linalg-on-tensors"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/test/e2e/cuda_specific/dot.mlir b/iree/test/e2e/cuda_specific/dot.mlir
new file mode 100644
index 000000000000..caf38c26a0ff
--- /dev/null
+++ b/iree/test/e2e/cuda_specific/dot.mlir
@@ -0,0 +1,29 @@
+func @f32() attributes { iree.module.export } {
+  %lhs = iree.unfoldable_constant dense<[
+    [15.0, 14.0, 13.0],
+    [12.0, 11.0, 10.0],
+    [09.0, 08.0, 07.0],
+    [06.0, 05.0, 04.0],
+    [03.0, 02.0, 01.0]]> : tensor<5x3xf32>
+  %rhs = iree.unfoldable_constant dense<[
+    [15.0, 14.0, 13.0, 12.0, 11.0],
+    [10.0, 09.0, 08.0, 07.0, 06.0],
+    [05.0, 04.0, 03.0, 02.0, 01.0]]> : tensor<3x5xf32>
+  %res = "mhlo.dot"(%lhs, %rhs) : (tensor<5x3xf32>, tensor<3x5xf32>) -> tensor<5x5xf32>
+  check.expect_almost_eq_const(%res, dense<[
+    [430.0, 388.0, 346.0, 304.0, 262.0],
+    [340.0, 307.0, 274.0, 241.0, 208.0],
+    [250.0, 226.0, 202.0, 178.0, 154.0],
+    [160.0, 145.0, 130.0, 115.0, 100.0],
+    [70.0, 64.0, 58.0, 52.0, 46.0]]> : tensor<5x5xf32>) : tensor<5x5xf32>
+  return
+}
+
+// Large aligned case that can be vectorized.
+func @large_aligned() attributes { iree.module.export } {
+  %lhs = iree.unfoldable_constant dense<1.0> : tensor<252x1024xf32>
+  %rhs = iree.unfoldable_constant dense<0.4> : tensor<1024x500xf32>
+  %res = "mhlo.dot"(%lhs, %rhs) : (tensor<252x1024xf32>, tensor<1024x500xf32>) -> tensor<252x500xf32>
+  check.expect_almost_eq_const(%res, dense<409.596> : tensor<252x500xf32>) : tensor<252x500xf32>
+  return
+}