diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu new file mode 100644 index 000000000..43835473b --- /dev/null +++ b/dev/cuda/encoder_backward.cu @@ -0,0 +1,199 @@ +/* +Kernels for the positional encoder forward pass in GPT-2. + +Compile example: +nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward + +version 1 is naive port from CPU code to kernel +parallelizes over B,T,C, uses atomics to add to dwte, dwpe +./encoder_backward 1 + +version 2 is another naive port +parallelizes over C, loops over B,T; much slower than version 1 +./encoder_backward 2 +*/ + +#include +#include +#include +#include "common.h" + +// ---------------------------------------------------------------------------- +// CPU code reference + +// GPT-2 positional encoder forward pass +void encoder_backward_cpu(float* dwte, float* dwpe, + float* dout, int* inp, + int B, int T, int C) { + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + float* dout_bt = dout + b * T * C + t * C; + int ix = inp[b * T + t]; + float* dwte_ix = dwte + ix * C; + float* dwpe_t = dwpe + t * C; + for (int i = 0; i < C; i++) { + float d = dout_bt[i]; + dwte_ix[i] += d; + dwpe_t[i] += d; + } + } + } +} + +// ---------------------------------------------------------------------------- +// GPU kernels + +// naive implementation with atomics +__global__ void encoder_backward_kernel1(float* dwte, float* dwpe, + const float* dout, const int* inp, + int B, int T, int C) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int N = B * T * C; + + if (idx < N) { + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; + + int ix = inp[b * T + t]; + + const float* dout_btc = dout + b * T * C + t * C + c; + float* dwte_ix = dwte + ix * C + c; + float* dwpe_tc = dwpe + t * C + c; + + atomicAdd(dwte_ix, *dout_btc); + atomicAdd(dwpe_tc, *dout_btc); + } +} + +// naive implementation that parallelizes over C and loops over B,T +// but it gets rid of atomics +__global__ void encoder_backward_kernel2(float* dwte, float* dwpe, + const float* dout, const int* inp, + int B, int T, int C) { + int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c >= C) { return; } // guard + int BT = B * T; + for (int i = 0; i < B * T; i++) { + int t = i % T; + int ix = inp[i]; + float dout_btc = dout[i * C + c]; + dwte[ix * C + c] += dout_btc; + dwpe[t * C + c] += dout_btc; + } +} + +// ---------------------------------------------------------------------------- +// kernel launcher + +void encoder_backward1(float* dwte, float* dwpe, + const float* dout, const int* inp, + int B, int T, int C, + const int block_size) { + const int N = B * T * C; + const int grid_size = ceil_div(N, block_size); + encoder_backward_kernel1<<>>(dwte, dwpe, dout, inp, B, T, C); + cudaCheck(cudaGetLastError()); +} + +void encoder_backward2(float* dwte, float* dwpe, + const float* dout, const int* inp, + int B, int T, int C, + const int block_size) { + const int grid_size = ceil_div(C, block_size); + encoder_backward_kernel2<<>>(dwte, dwpe, dout, inp, B, T, C); + cudaCheck(cudaGetLastError()); +} + +// kernel version dispatch +void encoder_backward(int kernel_num, + float* dwte, float* dwpe, + const float* dout, const int* inp, + int B, int T, int C, + const int block_size) { + switch (kernel_num) { + case 1: + encoder_backward1(dwte, dwpe, dout, inp, B, T, C, block_size); + break; + case 2: + encoder_backward2(dwte, dwpe, dout, inp, B, T, C, block_size); + break; + default: + printf("Invalid kernel number\n"); + exit(1); + } +} + +// ---------------------------------------------------------------------------- + +int main(int argc, char **argv) { + srand(0); + + int B = 8; + int T = 1024; + int C = 768; + int V = 50257; + + int deviceIdx = 0; + cudaCheck(cudaSetDevice(deviceIdx)); + + // create host memory of random numbers + float* dout = make_random_float(B * T * C); + int* inp = make_random_int(B * T, V); + float* dwte = make_zeros_float(V * C); + float* dwpe = make_zeros_float(T * C); + + // move to GPU + float* d_dout; + int* d_inp; + float* d_dwte; + float* d_dwpe; + cudaCheck(cudaMalloc(&d_dout, B * T * C * sizeof(float))); + cudaCheck(cudaMalloc(&d_inp, B * T * sizeof(int))); + cudaCheck(cudaMalloc(&d_dwte, V * C * sizeof(float))); + cudaCheck(cudaMalloc(&d_dwpe, T * C * sizeof(float))); + cudaCheck(cudaMemcpy(d_dout, dout, B * T * C * sizeof(float), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_inp, inp, B * T * sizeof(int), cudaMemcpyHostToDevice)); + + // read kernel_num from command line + int kernel_num = 1; + if (argc > 1) { + kernel_num = atoi(argv[1]); + } + printf("Using kernel %d\n", kernel_num); + + // set up block sizes + int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + + // first check the correctness of the kernel + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + encoder_backward_cpu(dwte, dwpe, dout, inp, B, T, C); + encoder_backward(kernel_num, d_dwte, d_dwpe, d_dout, d_inp, B, T, C, block_size); + validate_result(d_dwte, dwte, "dwte", V * C, 1e-5f); + validate_result(d_dwpe, dwpe, "dwpe", T * C, 1e-5f); + } + printf("All results match. Starting benchmarks.\n\n"); + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + int repeat_times = 1000; + float elapsed_time = benchmark_kernel(repeat_times, encoder_backward, + kernel_num, d_dwte, d_dwpe, d_dout, d_inp, B, T, C, block_size); + printf("block_size %4d | time %.4f ms\n", block_size, elapsed_time); + } + + // free memory + free(dout); + free(inp); + free(dwte); + free(dwpe); + cudaFree(d_dout); + cudaFree(d_inp); + cudaFree(d_dwte); + cudaFree(d_dwpe); + + return 0; +} \ No newline at end of file