[pytorch-vulkan][2/n] Height packing (pytorch#113883)
Summary:
Enable logic for converting a channel-packed tensor into a height-packed one.

Not yet connected to the rest of the system.
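
As a rough usage sketch (not part of this diff, which only adds the kernels and host functions): once wired in, callers would repack an existing channels-packed vTensor through the new entry points declared in impl/Packing.h. Namespace qualifications and the caller function are assumptions for illustration.

```cpp
#include <ATen/native/vulkan/impl/Packing.h>

namespace vk = at::native::vulkan;

// v_in is assumed to be a GPU tensor in TENSOR_CHANNELS_PACKED layout.
vk::vTensor to_height_packed(const vk::vTensor& v_in) {
  // Sizes and dtype are preserved; only the GPU memory layout changes,
  // so each output texel holds 4 consecutive H values of one channel.
  return vk::packing::convert_image_channels_packed_to_height_packed(v_in);
}
```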

Test Plan:
```
(base) yipjustin@yipjustin-mac fbsource % buck2 run  -c pt.has_backtraces=1  --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64  -- --gtest_filter="*packing*"
File changed: fbsource//xplat/caffe2/aten/src/ATen/test/vulkan_quantized_api_test.cpp
Buck UI: https://www.internalfb.com/buck2/9a0d6bd6-e4a2-4d58-8f38-f806a0703122
Network: Up: 0B  Down: 0B
Jobs completed: 4. Time elapsed: 0.1s.
BUILD SUCCEEDED
Running main() from third-party/googletest/1.14.0/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *packing*
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from VulkanAPITest
[ RUN      ] VulkanAPITest.channel_to_height_packing_test
[       OK ] VulkanAPITest.channel_to_height_packing_test (35 ms)
[----------] 1 test from VulkanAPITest (35 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test suite ran. (36 ms total)
[  PASSED  ] 1 test.
```

Reviewed By: SS-JIA

Differential Revision: D51379737

Pull Request resolved: pytorch#113883
Approved by: https://github.com/SS-JIA
yipjustin authored and pytorchmergebot committed Nov 18, 2023
1 parent fdaddec commit f8516ce
Showing 10 changed files with 464 additions and 109 deletions.
6 changes: 6 additions & 0 deletions aten/src/ATen/native/vulkan/api/Tensor.cpp
@@ -272,6 +272,7 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
gpu_sizes[1] = sizes[1];
gpu_sizes[2] = sizes[3];
gpu_sizes[3] = api::utils::align_up(sizes[3], INT64_C(4));
break;
case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
gpu_sizes[0] = sizes[0];
gpu_sizes[1] = sizes[1];
@@ -320,12 +321,17 @@ api::utils::uvec3 create_image_extents(
case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED:
TORCH_CHECK(width % 4 == 0, "Width must be divisible by 4!")
width /= 4;
break;
case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
TORCH_CHECK(height % 4 == 0, "Height must be divisible by 4!")
height /= 4;
break;
case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED:
TORCH_CHECK(channels % 4 == 0, "Channels must be divisible by 4!")
channels /= 4;
break;
default:
TORCH_CHECK(false, "Invalid memory format used!");
}

return {width, height, batch * channels};
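For concreteness, a standalone sketch of the height-packed branch of create_image_extents shown above. The pre-alignment of H to a multiple of 4 is assumed to happen in the elided calc_gpu_sizes lines; the helper name here is hypothetical.

```cpp
#include <array>
#include <cstdint>

// Illustrative only: height-packed image extents are {W, H / 4, N * C},
// where H has already been padded to a multiple of 4 so that four
// consecutive rows fit in one texel.
std::array<uint32_t, 3> height_packed_extents(
    uint32_t batch, uint32_t channels, uint32_t height, uint32_t width) {
  return {width, height / 4u, batch * channels};
}

// Example: a (padded) 1 x 2 x 8 x 3 tensor maps to a 3 x 2 x 2 image.
```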
12 changes: 12 additions & 0 deletions aten/src/ATen/native/vulkan/api/Utils.h
@@ -151,6 +151,18 @@ inline ivec4 make_ivec4(IntArrayRef ints, bool reverse = false) {
}
}

inline ivec4 make_ivec4_prepadded1(IntArrayRef ints) {
TORCH_CHECK(ints.size() <= 4);

ivec4 result = {1, 1, 1, 1};
size_t base = 4 - ints.size();
for (size_t i = 0; i < ints.size(); ++i) {
result.data[i + base] = safe_downcast<int32_t>(ints[i]);
}

return result;
}

inline ivec3 make_ivec3(uvec3 ints) {
return {
safe_downcast<int32_t>(ints.data[0u]),
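A quick, self-contained restatement of the padding rule make_ivec4_prepadded1 implements (prepad_to_nchw is a hypothetical stand-in, not the real helper); this is what lets the repacking shaders always treat the sizes uniform as NCHW.

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Sizes with fewer than 4 dims are right-aligned into an NCHW vector and
// the leading entries are filled with 1 (assumes sizes.size() <= 4).
std::array<int32_t, 4> prepad_to_nchw(const std::vector<int64_t>& sizes) {
  std::array<int32_t, 4> out = {1, 1, 1, 1};
  const size_t base = 4 - sizes.size();
  for (size_t i = 0; i < sizes.size(); ++i) {
    out[i + base] = static_cast<int32_t>(sizes[i]);
  }
  return out;
}
// prepad_to_nchw({8, 5})    -> {1, 1, 8, 5}   // (H, W) becomes N = C = 1
// prepad_to_nchw({3, 8, 5}) -> {1, 3, 8, 5}   // (C, H, W) becomes N = 1
```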
@@ -0,0 +1,60 @@
#version 450 core
#define PRECISION $precision
#define FORMAT $format

#include "indexing.h"

layout(std430) buffer;

/* Qualifiers: layout - storage - precision - memory */

layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
ivec4 sizes;
} uBlock;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);

int src_w = pos.x;
int src_base_h = pos.y * 4;

// uBlock.sizes.y is the c in nchw.
int num_c = uBlock.sizes.y;

int src_c = pos.z % num_c;
int src_n = pos.z / num_c;

// Fetch the 4 elements from the channel-packed tensor
ivec4 src_pos0 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_base_h, src_w),
uBlock.sizes);

ivec4 src_pos1 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_base_h + 1, src_w),
uBlock.sizes);

ivec4 src_pos2 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_base_h + 2, src_w),
uBlock.sizes);

ivec4 src_pos3 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_base_h + 3, src_w),
uBlock.sizes);

vec4 t0 = texelFetch(uInput, src_pos0.xyz, 0);
vec4 t1 = texelFetch(uInput, src_pos1.xyz, 0);
vec4 t2 = texelFetch(uInput, src_pos2.xyz, 0);
vec4 t3 = texelFetch(uInput, src_pos3.xyz, 0);

vec4 out_t = vec4(
t0[src_pos0.w],
t1[src_pos1.w],
t2[src_pos2.w],
t3[src_pos3.w]);

imageStore(uOutput, pos, out_t);
}
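
To make the gather above concrete, a small host-side sketch (function name hypothetical) of which NCHW element lands in component k of the height-packed output texel at (x, y, z), given a 4-d sizes vector:

```cpp
#include <array>

// For output texel (x, y, z), component k comes from this NCHW index
// of the channels-packed input (sizes = {N, C, H, W}).
std::array<int, 4> height_packed_source_index(
    int x, int y, int z, int k, const std::array<int, 4>& sizes) {
  const int num_c = sizes[1];
  return {
      z / num_c,   // n
      z % num_c,   // c: each channel owns a full z-slice in the output
      y * 4 + k,   // h: four consecutive rows packed into one texel
      x};          // w
}
// Example: sizes = {1, 6, 8, 5}, output texel (x=2, y=1, z=3)
//   -> components 0..3 hold input elements {n=0, c=3, h=4..7, w=2}.
```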
@@ -0,0 +1,60 @@
#version 450 core
#define PRECISION $precision
#define FORMAT $format

#include "indexing.h"

layout(std430) buffer;

/* Qualifiers: layout - storage - precision - memory */

layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
ivec4 sizes;
} uBlock;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);

int src_base_w = pos.x * 4;
int src_h = pos.y;

// uBlock.sizes.y is the c in nchw.
int num_c = uBlock.sizes.y;

int src_c = pos.z % num_c;
int src_n = pos.z / num_c;

// Fetch the 4 elements from the channel-packed tensor
ivec4 src_pos0 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_h, src_base_w),
uBlock.sizes);

ivec4 src_pos1 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_h, src_base_w + 1),
uBlock.sizes);

ivec4 src_pos2 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_h, src_base_w + 2),
uBlock.sizes);

ivec4 src_pos3 = get_channel_packed_pos_from_index(
ivec4(src_n, src_c, src_h, src_base_w + 3),
uBlock.sizes);

vec4 t0 = texelFetch(uInput, src_pos0.xyz, 0);
vec4 t1 = texelFetch(uInput, src_pos1.xyz, 0);
vec4 t2 = texelFetch(uInput, src_pos2.xyz, 0);
vec4 t3 = texelFetch(uInput, src_pos3.xyz, 0);

vec4 out_t = vec4(
t0[src_pos0.w],
t1[src_pos1.w],
t2[src_pos2.w],
t3[src_pos3.w]);

imageStore(uOutput, pos, out_t);
}
22 changes: 22 additions & 0 deletions aten/src/ATen/native/vulkan/glsl/indexing.h
@@ -11,3 +11,25 @@ uvec4 idx_to_coord(const uint idx, const uvec4 strides, const uvec4 sizes) {
uint coord_to_idx(const uvec4 coord, const uvec4 strides) {
return int(dot(coord * strides, ivec4(1)));
}

int align_up_4(int v) {
return ((v + 4 - 1) / 4) * 4;
}

// Return the (x, y, z) texel position and the component index within that
// texel of a channel-packed 3D tensor, given the {n, c, h, w} index.
ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) {
int n = nchw.x;
int c = nchw.y;
int h = nchw.z;
int w = nchw.w;

int aligned_c = align_up_4(sizes.y);
int c_stride = aligned_c / 4;

return ivec4(
w, // x
h, // y
n * c_stride + c / 4, // z
c % 4);
}
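A worked example of the mapping above, restated in host code for illustration (helper name hypothetical):

```cpp
#include <array>

// (x, y, z, component) of an NCHW element inside a channels-packed image,
// mirroring get_channel_packed_pos_from_index.
std::array<int, 4> channel_packed_pos(
    const std::array<int, 4>& nchw, const std::array<int, 4>& sizes) {
  const int n = nchw[0], c = nchw[1], h = nchw[2], w = nchw[3];
  const int aligned_c = ((sizes[1] + 3) / 4) * 4;  // align_up_4(C)
  const int c_stride = aligned_c / 4;              // z-slices per batch entry
  return {w, h, n * c_stride + c / 4, c % 4};
}
// Example: sizes = {1, 6, 8, 5}, nchw = {0, 5, 3, 2}
//   aligned_c = 8, c_stride = 2 -> texel (x=2, y=3, z=1), component 1.
```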
66 changes: 66 additions & 0 deletions aten/src/ATen/native/vulkan/impl/Packing.cpp
@@ -1,3 +1,4 @@
#include <ATen/native/vulkan/api/Utils.h>
#include <ATen/native/vulkan/impl/Common.h>
#include <ATen/native/vulkan/impl/Packing.h>

@@ -283,6 +284,71 @@ bool record_buffer_to_nchw_op(
v_src.buffer_metadata());
}

vTensor channel_image_repacking(
const vTensor& v_input,
api::GPUMemoryLayout target_layout,
const api::ShaderInfo& shader_descriptor) {
api::Context* const context = api::context();

vTensor v_output{
context,
v_input.sizes(),
v_input.dtype(),
v_input.storage_type(),
target_layout,
};

// Required to determine how to insert memory barriers in the command buffer
api::PipelineBarrier pipeline_barrier{};

// The shader assumes a 4-d NCHW size vector to calculate the lookup coordinate.
// If the input has fewer than 4 dims, we pad the sizes with 1's at the front.
const struct Block final {
api::utils::ivec4 sizes;
} block{
api::utils::make_ivec4_prepadded1(v_input.sizes()),
};

api::UniformParamsBuffer params(context, block);

context->submit_compute_job(
// shader descriptor
// VK_KERNEL(packing_channel_to_height),
shader_descriptor,
// pipeline barrier
pipeline_barrier,
// global work group size
v_output.extents(),
// local work group size
adaptive_work_group_size(v_output.extents()),
// fence handle
VK_NULL_HANDLE,
// shader arguments
v_output.image(
pipeline_barrier,
api::PipelineStage::COMPUTE,
api::MemoryAccessType::WRITE),
v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
// params buffer
params.buffer());

return v_output;
}

vTensor convert_image_channels_packed_to_height_packed(const vTensor& v_input) {
return channel_image_repacking(
v_input,
api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED,
VK_KERNEL(convert_channels_to_height_packed));
}

vTensor convert_image_channels_packed_to_width_packed(const vTensor& v_input) {
return channel_image_repacking(
v_input,
api::GPUMemoryLayout::TENSOR_WIDTH_PACKED,
VK_KERNEL(convert_channels_to_width_packed));
}

} // namespace packing
} // namespace vulkan
} // namespace native
4 changes: 4 additions & 0 deletions aten/src/ATen/native/vulkan/impl/Packing.h
@@ -38,6 +38,10 @@ bool record_buffer_to_nchw_op(
api::PipelineBarrier pipeline_barrier,
const VkFence fence_handle);

vTensor convert_image_channels_packed_to_height_packed(const vTensor& v_input);

vTensor convert_image_channels_packed_to_width_packed(const vTensor& v_input);

} // namespace packing
} // namespace vulkan
} // namespace native