[pytorch-vulkan][2/n] Height packing (pytorch#113883)

Summary: Enable logic for converting a channel-packed tensor into a height-packed one. Not yet connected with the rest of the system. Test Plan: ``` (base) yipjustin@yipjustin-mac fbsource % buck2 run -c pt.has_backtraces=1 --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64 -- --gtest_filter="*packing*" File changed: fbsource//xplat/caffe2/aten/src/ATen/test/vulkan_quantized_api_test.cpp Buck UI: https://www.internalfb.com/buck2/9a0d6bd6-e4a2-4d58-8f38-f806a0703122 Network: Up: 0B Down: 0B Jobs completed: 4. Time elapsed: 0.1s. BUILD SUCCEEDED Running main() from third-party/googletest/1.14.0/googletest/googletest/src/gtest_main.cc Note: Google Test filter = *packing* [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from VulkanAPITest [ RUN ] VulkanAPITest.channel_to_height_packing_test [ OK ] VulkanAPITest.channel_to_height_packing_test (35 ms) [----------] 1 test from VulkanAPITest (35 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test suite ran. (36 ms total) [ PASSED ] 1 test. ``` Reviewed By: SS-JIA Differential Revision: D51379737 Pull Request resolved: pytorch#113883 Approved by: https://github.com/SS-JIA
gflatters · Nov 18, 2023 · f8516ce · f8516ce
1 parent fdaddec
commit f8516ce
Show file tree

Hide file tree

Showing 10 changed files with 464 additions and 109 deletions.
diff --git a/aten/src/ATen/native/vulkan/api/Tensor.cpp b/aten/src/ATen/native/vulkan/api/Tensor.cpp
@@ -272,6 +272,7 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
             gpu_sizes[1] = sizes[1];
             gpu_sizes[2] = sizes[3];
             gpu_sizes[3] = api::utils::align_up(sizes[3], INT64_C(4));
+            break;
           case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
             gpu_sizes[0] = sizes[0];
             gpu_sizes[1] = sizes[1];
@@ -320,12 +321,17 @@ api::utils::uvec3 create_image_extents(
       case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED:
         TORCH_CHECK(width % 4 == 0, "Channels must be divisible by 4!")
         width /= 4;
+        break;
       case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
         TORCH_CHECK(height % 4 == 0, "Channels must be divisible by 4!")
         height /= 4;
+        break;
       case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED:
         TORCH_CHECK(channels % 4 == 0, "Channels must be divisible by 4!")
         channels /= 4;
+        break;
+      default:
+        TORCH_CHECK(false, "Invalid memory format used!");
     }
 
     return {width, height, batch * channels};

diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h
@@ -151,6 +151,18 @@ inline ivec4 make_ivec4(IntArrayRef ints, bool reverse = false) {
   }
 }
 
+// Builds an ivec4 from at most four integers. The inputs are right-aligned:
+// they fill the trailing slots in order, and every leading slot that has no
+// corresponding input keeps the value 1. The TORCH_CHECK fails when more
+// than four values are passed.
+inline ivec4 make_ivec4_prepadded1(IntArrayRef ints) {
+  TORCH_CHECK(ints.size() <= 4);
+
+  ivec4 padded = {1, 1, 1, 1};
+  // First slot that receives an input value; everything before it stays 1.
+  size_t slot = 4 - ints.size();
+  for (const auto value : ints) {
+    padded.data[slot] = safe_downcast<int32_t>(value);
+    ++slot;
+  }
+
+  return padded;
+}
+
 inline ivec3 make_ivec3(uvec3 ints) {
   return {
       safe_downcast<int32_t>(ints.data[0u]),

diff --git a/aten/src/ATen/native/vulkan/glsl/convert_channels_to_height_packed.glsl b/aten/src/ATen/native/vulkan/glsl/convert_channels_to_height_packed.glsl
@@ -0,0 +1,60 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+#include "indexing.h"
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
+layout(set = 0, binding = 1)         uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 sizes;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Each invocation writes one texel of the height-packed output. Its four
+ * lanes hold rows 4 * pos.y .. 4 * pos.y + 3 of a single (n, c) plane at
+ * column pos.x, gathered from the channel-packed input.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // The dispatch is rounded up to whole work groups, so some invocations fall
+  // outside the output image. They must neither fetch nor store.
+  if (any(greaterThanEqual(pos, imageSize(uOutput)))) {
+    return;
+  }
+
+  // uBlock.sizes is {n, c, h, w}.
+  const int num_c = uBlock.sizes.y;
+  const int num_h = uBlock.sizes.z;
+
+  const int src_w = pos.x;
+  const int src_base_h = pos.y * 4;
+  const int src_c = pos.z % num_c;
+  const int src_n = pos.z / num_c;
+
+  // Rows at or beyond the real height exist only as padding in the packed
+  // output. Leave those lanes at zero instead of reading outside the input.
+  vec4 out_t = vec4(0);
+  for (int i = 0; i < 4; ++i) {
+    const int src_h = src_base_h + i;
+    if (src_h < num_h) {
+      // xyz is the texel position in the input, w the lane inside that texel.
+      const ivec4 src_pos = get_channel_packed_pos_from_index(
+          ivec4(src_n, src_c, src_h, src_w),
+          uBlock.sizes);
+      out_t[i] = texelFetch(uInput, src_pos.xyz, 0)[src_pos.w];
+    }
+  }
+
+  imageStore(uOutput, pos, out_t);
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/convert_channels_to_width_packed.glsl b/aten/src/ATen/native/vulkan/glsl/convert_channels_to_width_packed.glsl
@@ -0,0 +1,60 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+#include "indexing.h"
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
+layout(set = 0, binding = 1)         uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 sizes;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Each invocation writes one texel of the width-packed output. Its four
+ * lanes hold columns 4 * pos.x .. 4 * pos.x + 3 of a single (n, c) plane at
+ * row pos.y, gathered from the channel-packed input.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // The dispatch is rounded up to whole work groups, so some invocations fall
+  // outside the output image. They must neither fetch nor store.
+  if (any(greaterThanEqual(pos, imageSize(uOutput)))) {
+    return;
+  }
+
+  // uBlock.sizes is {n, c, h, w}.
+  const int num_c = uBlock.sizes.y;
+  const int num_w = uBlock.sizes.w;
+
+  const int src_base_w = pos.x * 4;
+  const int src_h = pos.y;
+  const int src_c = pos.z % num_c;
+  const int src_n = pos.z / num_c;
+
+  // Columns at or beyond the real width exist only as padding in the packed
+  // output. Leave those lanes at zero instead of reading outside the input.
+  vec4 out_t = vec4(0);
+  for (int i = 0; i < 4; ++i) {
+    const int src_w = src_base_w + i;
+    if (src_w < num_w) {
+      // xyz is the texel position in the input, w the lane inside that texel.
+      const ivec4 src_pos = get_channel_packed_pos_from_index(
+          ivec4(src_n, src_c, src_h, src_w),
+          uBlock.sizes);
+      out_t[i] = texelFetch(uInput, src_pos.xyz, 0)[src_pos.w];
+    }
+  }
+
+  imageStore(uOutput, pos, out_t);
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/indexing.h b/aten/src/ATen/native/vulkan/glsl/indexing.h
@@ -11,3 +11,25 @@ uvec4 idx_to_coord(const uint idx, const uvec4 strides, const uvec4 sizes) {
 // Flattens a 4D coordinate into a linear index: the sum over all four
 // components of coord[i] * strides[i]. The sum is taken in unsigned integer
 // arithmetic; dot() is only defined for floating-point vectors, so going
 // through it would convert the operands and stop being exact above 2^24.
 uint coord_to_idx(const uvec4 coord, const uvec4 strides) {
   const uvec4 terms = coord * strides;
   return terms.x + terms.y + terms.z + terms.w;
 }
+
+// For v >= 0, rounds v up to the next multiple of 4 (multiples of 4 are
+// returned unchanged).
+int align_up_4(int v) {
+  const int quads = (v + 3) / 4;
+  return quads * 4;
+}
+
+// Maps an {n, c, h, w} index to its location in the channel-packed 3D image.
+// The result is (x, y, z, lane): the texel position followed by the component
+// inside that texel. `sizes` is laid out as {n, c, h, w}; only sizes.y (the
+// channel count) is read.
+ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) {
+  // Number of texels along z taken up by one batch entry: channels rounded up
+  // to a multiple of 4, four channels per texel.
+  const int texels_per_batch = align_up_4(sizes.y) / 4;
+
+  const int x = nchw.w;
+  const int y = nchw.z;
+  const int z = nchw.x * texels_per_batch + nchw.y / 4;
+  const int lane = nchw.y % 4;
+
+  return ivec4(x, y, z, lane);
+}
diff --git a/aten/src/ATen/native/vulkan/impl/Packing.cpp b/aten/src/ATen/native/vulkan/impl/Packing.cpp
@@ -1,3 +1,4 @@
+#include <ATen/native/vulkan/api/Utils.h>
 #include <ATen/native/vulkan/impl/Common.h>
 #include <ATen/native/vulkan/impl/Packing.h>
 
@@ -283,6 +284,71 @@ bool record_buffer_to_nchw_op(
       v_src.buffer_metadata());
 }
 
+// Submits `shader_descriptor` as a compute job that reads the image of
+// `v_input` and writes the image of a newly created tensor. The new tensor
+// has the same sizes, dtype and storage type as `v_input` but uses
+// `target_layout`, and it is returned to the caller.
+vTensor channel_image_repacking(
+    const vTensor& v_input,
+    api::GPUMemoryLayout target_layout,
+    const api::ShaderInfo& shader_descriptor) {
+  api::Context* const ctx = api::context();
+
+  vTensor v_repacked{
+      ctx,
+      v_input.sizes(),
+      v_input.dtype(),
+      v_input.storage_type(),
+      target_layout,
+  };
+
+  // Filled in by the image() accessors below; tells the command buffer which
+  // memory barriers to insert.
+  api::PipelineBarrier barrier{};
+
+  // The shader computes its lookup coordinate from a 4d nchw shape, so an
+  // input with fewer dimensions is padded with 1's on the front.
+  const struct Block final {
+    api::utils::ivec4 sizes;
+  } uniform_block{
+      api::utils::make_ivec4_prepadded1(v_input.sizes()),
+  };
+
+  api::UniformParamsBuffer uniform_buffer(ctx, uniform_block);
+
+  // One invocation per texel of the output image.
+  const auto& out_extents = v_repacked.extents();
+
+  ctx->submit_compute_job(
+      // shader descriptor
+      shader_descriptor,
+      // pipeline barrier
+      barrier,
+      // global work group size
+      out_extents,
+      // local work group size
+      adaptive_work_group_size(out_extents),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_repacked.image(
+          barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_input.image(barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      uniform_buffer.buffer());
+
+  return v_repacked;
+}
+
+// Returns a new tensor holding the data of `v_input` in TENSOR_HEIGHT_PACKED
+// layout, produced by the convert_channels_to_height_packed shader.
+vTensor convert_image_channels_packed_to_height_packed(const vTensor& v_input) {
+  const api::GPUMemoryLayout height_packed =
+      api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED;
+  return channel_image_repacking(
+      v_input, height_packed, VK_KERNEL(convert_channels_to_height_packed));
+}
+
+// Returns a new tensor holding the data of `v_input` in TENSOR_WIDTH_PACKED
+// layout, produced by the convert_channels_to_width_packed shader.
+vTensor convert_image_channels_packed_to_width_packed(const vTensor& v_input) {
+  const api::GPUMemoryLayout width_packed =
+      api::GPUMemoryLayout::TENSOR_WIDTH_PACKED;
+  return channel_image_repacking(
+      v_input, width_packed, VK_KERNEL(convert_channels_to_width_packed));
+}
+
 } // namespace packing
 } // namespace vulkan
 } // namespace native

diff --git a/aten/src/ATen/native/vulkan/impl/Packing.h b/aten/src/ATen/native/vulkan/impl/Packing.h
@@ -38,6 +38,10 @@ bool record_buffer_to_nchw_op(
     api::PipelineBarrier pipeline_barrier,
     const VkFence fence_handle);
 
+vTensor convert_image_channels_packed_to_height_packed(const vTensor& v_input); // Returns a new tensor with v_input's sizes and dtype in TENSOR_HEIGHT_PACKED layout.
+
+vTensor convert_image_channels_packed_to_width_packed(const vTensor& v_input); // Returns a new tensor with v_input's sizes and dtype in TENSOR_WIDTH_PACKED layout.
+
 } // namespace packing
 } // namespace vulkan
 } // namespace native