Merge pull request #22 from junjihashimoto/feature/transpose
Add transpose kernels
austinvhuang authored Jul 29, 2024
2 parents bd8192b + b1da604 commit 47a85b7
Showing 4 changed files with 272 additions and 1 deletion.
4 changes: 3 additions & 1 deletion Makefile
@@ -34,6 +34,7 @@ all: dawnlib check-clang check-linux-vulkan lib pch
cd examples/gpu_puzzles && make build/gpu_puzzles
cd examples/hello_world && make build/hello_world
cd examples/matmul && make build/matmul
cd examples/transpose && make build/transpose
cd examples/physics && make build/physics
cd examples/render && make build/render

@@ -78,6 +79,7 @@ clean:
rm -rf examples/gpu_puzzles/build/*
rm -rf examples/hello_world/build/*
rm -rf examples/matmul/build/matmul
rm -rf examples/transpose/build/transpose
rm -rf examples/physics/build/*
rm -rf examples/render/build/*
rm -f build/gpu.h.pch
@@ -108,7 +110,7 @@ check-linux-vulkan:
echo "Vulkan is installed."; \
vulkaninfo; \
else \
echo "Vulkan is not installed. Please install Vulkan drivers to continue. On Debian / Ubuntu: sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools"; \
echo "Vulkan is not installed. Please install Vulkan drivers to continue. On Debian / Ubuntu: sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools"; \
exit 1; \
fi \
else \
1 change: 1 addition & 0 deletions examples/README.md
@@ -16,4 +16,5 @@ In order of beginning to advanced:
| [shadertui](shadertui) | A terminal UI take on shadertoy demonstrating runtime live reloading of WGSL. |
| [physics](physics) | Parallel physics simulation of a double pendulum with each thread starting at a different initial condition. |
| [matmul](matmul) | Tiled matrix multiplication. |
| [transpose](transpose) | Tiled matrix transpose. |
| [webgpu_from_scratch](webgpu_from_scratch) | A minimal from-scratch example of how to use WebGPU directly without this library. This is useful to understand the code internals of gpu.cpp. Note this takes a while to build as it compiles the WebGPU C API implementation. |
27 changes: 27 additions & 0 deletions examples/transpose/Makefile
@@ -0,0 +1,27 @@
CXX=clang++
GPUCPP ?= $(PWD)/../..
LIBDIR ?= $(GPUCPP)/third_party/lib
LIBSPEC ?= . $(GPUCPP)/source
NUM_JOBS?=$(shell nproc)
CODEPATH = find . ../../utils ../../ -maxdepth 1 -type f
TARGET=transpose
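# Probe whether a trivial C++17 compile that includes <array> succeeds with the default standard library; fall back to -stdlib=libc++ if not.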
ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/null 2>&1 ; echo $$?),0)
STDLIB :=
else
STDLIB := -stdlib=libc++
endif
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib run.cpp -ldl -ldawn

run: ./build/$(TARGET)
$(LIBSPEC) && ./build/$(TARGET)

# Use clang -v to see the include paths
build/$(TARGET): run.cpp
mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET)

watch:
@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
mkdir -p build && $(CODEPATH) | entr -s "$(LIBSPEC) && rm -f ./build/$(TARGET) && make -j$(NUM_JOBS) ./build/$(TARGET) && ./build/$(TARGET)"

clean:
read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*
241 changes: 241 additions & 0 deletions examples/transpose/run.cpp
@@ -0,0 +1,241 @@
#include <array>
#include <chrono>
#include <future>
#include <random>
#include <cstdlib>

#include "gpu.h" // createContext, createTensor, createKernel, dispatchKernel,
// wait, resetCommandBuffer, toCPU

#include "llmc/reference_impls.h" // for CPU reference implementation
#include "utils/array_utils.h" // show, isclose, randn, randint
#include "utils/logging.h" // LOG
#include "experimental/wgsl.h" // loopUnrolling

using namespace gpu;

// This implements the transpose kernels in https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc .
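// Kernel 1 (naive): one thread per element. Thread (globalID.x, globalID.y)
// reads A at row globalID.y, column globalID.x of the M x N input and writes
// it to the transposed position in the N x M output, i.e. B[i][j] = A[j][i].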

static const char *kShaderTranspose1 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
@builtin(global_invocation_id) globalID : vec3<u32>) {
let bRow: u32 = globalID.x;
let bCol: u32 = globalID.y;
B[bRow * {{M}} + bCol] = A[bCol * {{N}} + bRow];
}
)";

inline KernelCode createTranspose1(const char *shaderTemplate,
const size_t M, const size_t N,
const Shape &workgroupSize = {256, 1, 1},
NumType precision = kf32) {
std::string codeString(shaderTemplate);
replaceAll(codeString, {{"{{workgroupSize}}", toString(workgroupSize)},
{"{{precision}}", toString(precision)},
{"{{M}}", toString(M)},
{"{{N}}", toString(N)}});
return {codeString, workgroupSize};
}

// Shared memory cache-blocking
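// Each workgroup stages one BN x BM block of the input in workgroup (shared)
// memory. In the first phase every thread copies TM * TN elements from A into
// `tile`, transposing the indices as it stores; after workgroupBarrier(), the
// second phase writes `tile` back out as contiguous rows of B, so both the
// global read and the global write traverse memory row-contiguously.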
static const char *kShaderTranspose2 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
var<workgroup> tile: array<{{precision}}, {{BN}} * {{BM}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
@builtin(local_invocation_id) localID : vec3<u32>,
@builtin(workgroup_id) groupID: vec3<u32>) {
let bRow: u32 = groupID.x * {{BN}};
let bCol: u32 = groupID.y * {{BM}};
let aPtr = bCol * {{N}} + bRow;
let bPtr = bRow * {{M}} + bCol;
let numThread: u32 = ({{BM}} * {{BN}}) / ({{TM}} * {{TN}});
for (var resIdxM: u32 = 0; resIdxM < {{TM}}; resIdxM++) {
for (var resIdxN: u32 = 0; resIdxN < {{TN}}; resIdxN++) {
let idx: u32 = localID.x + numThread * (resIdxN + {{TN}} * resIdxM);
let loadRow: u32 = idx / {{BN}};
let loadCol: u32 = idx % {{BN}};
tile[loadCol * {{BN}} + loadRow] = A[aPtr + loadRow * {{N}} + loadCol];
}
}
workgroupBarrier();
for (var resIdxN: u32 = 0; resIdxN < {{TN}}; resIdxN++) {
for (var resIdxM: u32 = 0; resIdxM < {{TM}}; resIdxM++) {
let idx: u32 = localID.x + numThread * (resIdxM + {{TM}} * resIdxN);
let loadRow: u32 = idx / {{BM}};
let loadCol: u32 = idx % {{BM}};
B[bPtr + loadRow * {{M}} + loadCol] = tile[loadRow * {{BM}} + loadCol];
}
}
}
)";

inline KernelCode createTranspose2(const char *shaderTemplate,
const size_t M, const size_t N,
const size_t BM, const size_t BN,
const size_t TM, const size_t TN,
const Shape &workgroupSize = {256, 1, 1},
NumType precision = kf32) {
assert(BM % TM == 0);
assert(BN % TN == 0);
assert(M % BM == 0);
assert(N % BN == 0);
std::string codeString(shaderTemplate);
replaceAll(codeString, {{"{{workgroupSize}}", toString(workgroupSize)},
{"{{precision}}", toString(precision)},
{"{{M}}", toString(M)},
{"{{N}}", toString(N)},
{"{{BM}}", toString(BM)},
{"{{BN}}", toString(BN)},
{"{{TM}}", toString(TM)},
{"{{TN}}", toString(TN)}
});
std::string unrolledCode = codeString; // loopUnrolling(codeString);
return {unrolledCode, workgroupSize};
}

void initData(size_t M, size_t N, std::unique_ptr<float[]> &inputPtr) {
std::mt19937 gen(314159);
randn(inputPtr.get(), M * N, gen);
LOG(kDefLog, kInfo, "%s", show<float>(inputPtr.get(), M, N, "Input").c_str());
}

Kernel selectTranspose(Context &ctx, int version,
const Bindings</* input, output */ 2> &bindings,
size_t M, size_t N) {
Kernel kernel;
if (version == 1) {
Shape wgSize = {16, 16, 1};
LOG(kDefLog, kInfo, "wgSize: %s", toString(wgSize).c_str());
KernelCode transpose =
createTranspose1(kShaderTranspose1, M, N, /*wgsize*/ wgSize); // The shape of input == M x N
kernel = createKernel(ctx, transpose, bindings,
/*nWorkgroups*/ cdiv({N, M, 1}, wgSize)); // The shape of output == N x M
} else if (version == 2) {
static constexpr size_t BM = 64;
static constexpr size_t BK = 16;
static constexpr size_t BN = 64;
static constexpr size_t TM = BM / BK;
static constexpr size_t TN = BN / BK;
Shape wgSize = {(BM / TM) * (BN / TN), 1, 1}; // This is the same as BK * BK.
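// Sanity check: 256 threads each move TM * TN = 16 elements, exactly covering one BM x BN = 64 x 64 tile.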
Shape nWorkgroups = {cdiv(N, BN), cdiv(M, BM), 1};
LOG(kDefLog, kInfo, "M: %d, N: %d", M, N);
LOG(kDefLog, kInfo, "BM: %d, BK: %d, BN: %d, TM: %d, TN: %d", BM, BK, BN, TM, TN);
LOG(kDefLog, kInfo, "wgSize: ( %s )", toString(wgSize).c_str());
LOG(kDefLog, kInfo, "nWorkgroups: ( %s )", toString(nWorkgroups).c_str());
KernelCode transpose = createTranspose2(kShaderTranspose2, M, N, BM, BN, TM, TN,
/*wgSize*/ wgSize,
kf32);
kernel = createKernel(ctx, transpose, bindings,
/*nWorkgroups*/ nWorkgroups);
} else if (version == 0) {
LOG(kDefLog, kInfo, "Skip Creating Kernel", M, N);
}
return kernel;
}

void runTest(int version, size_t M, size_t N,
std::unique_ptr<float[]> &inputPtr,
std::unique_ptr<float[]> &outputPtr) {
bool isCPU = version == 0;

// Allocate GPU buffers and copy data
Context ctx = createContext();
Tensor input = createTensor(ctx, Shape{M, N}, kf32, inputPtr.get());
Tensor output = createTensor(ctx, Shape{N, M}, kf32);

constexpr size_t nIter = 50;

// Initialize Kernel and bind GPU buffers
LOG(kDefLog, kInfo, "Creating Kernel");
Kernel kernel = selectTranspose(ctx, version, {input, output}, M, N);

// Dispatch kernel execution
LOG(kDefLog, kInfo, "Dispatching Kernel version %d, %d iterations ...",
version, nIter);

// pre-allocate promises and futures for async dispatch
// TODO(avh): implement a pooling mechanism for promises/futures in gpu.h
std::array<std::promise<void>, nIter> promises;
std::array<std::future<void>, nIter> futures;
for (size_t i = 0; i < nIter; i++) {
futures[i] = promises[i].get_future();
}

// Dispatch kernel nIter times
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < nIter; i++) {
if (!isCPU) {
dispatchKernel(ctx, kernel, promises[i]);
wait(ctx, futures[i]);
resetCommandBuffer(ctx.device, kernel);
} else {
transpose(inputPtr.get(), outputPtr.get(), M, N);
}
}
auto end = std::chrono::high_resolution_clock::now();

// Report performance.
// Use microseconds for a more accurate time measurement
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
float gbps = sizeof(float) * M * N /
(static_cast<double>(duration.count()) / 1000000.0) /
1000000000.0 * static_cast<float>(nIter);
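// Note: this counts the M * N floats of a single matrix per dispatch;
// counting both the read of A and the write of B would double the figure.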

LOG(kDefLog, kInfo, "Copying result to CPU");
if (!isCPU) {
toCPU(ctx, output, outputPtr.get(), M * N * sizeof(float));
}
LOG(kDefLog, kInfo, "%s",
show<float>(outputPtr.get(), N, M, "Output").c_str());

LOG(kDefLog, kInfo, "\n\n===================================================================="
"============\nExecution Time: (M = %d, N = %d) x %d iterations "
":\n%.3f "
"milliseconds / dispatch ~ %.2f "
"GB/s\n================================================================"
"================\n\n",
M, N, nIter, duration.count() / static_cast<double>(nIter) / 1000.0 /* us -> ms */, gbps);
}

int main() {
char* version_str = getenv("TEST_VERSION");
int version = version_str == nullptr ? 2 : atoi(version_str);
// 0 == cpu
// 1 == naive transpose
// 2 == tiling with shared memory

size_t M, N; // Matrix dimensions
static constexpr int kTestSize = 2;
if constexpr (kTestSize == 0) {
// Tiny test
M = 16;
N = 32;
} else if constexpr (kTestSize == 1) {
// Small test
M = 256;
N = 512;
} else {
// Large test
M = 4096;
N = 2 * 4096;
}

std::unique_ptr<float[]> inputPtr = std::make_unique<float[]>(M * N);
std::unique_ptr<float[]> outputPtr = std::make_unique<float[]>(N * M);

initData(M, N, inputPtr);
runTest(version, M, N, inputPtr, outputPtr);

LOG(kDefLog, kInfo, "Done.");
return 0;
}
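To try the kernels locally (assuming the Dawn library and headers have already been fetched and built via the top-level Makefile, as for the other examples):

  cd examples/transpose
  make run

The TEST_VERSION environment variable picks the kernel at runtime: TEST_VERSION=0 make run for the CPU reference, TEST_VERSION=1 for the naive kernel, and the default (2) for the shared-memory tiled version.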
