Skip to content

Commit

Permalink
[GPU/OpenCl] Kernel optimization
Browse files Browse the repository at this point in the history
Kernel Optimized for GPU. Some trivial changes in code.

Signed-off-by: Yash Singh <[email protected]>
  • Loading branch information
yashSingh0723 committed Sep 12, 2024
1 parent 9eaca55 commit 1b0be27
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 78 deletions.
29 changes: 16 additions & 13 deletions nntrainer/tensor/cl_operations/attention_kernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace nntrainer {
* @param[out] freqs base frequencies array to be used in the future computation
* @param[in] theta rotary angle
*/
void precompute_freqs(int dim, unsigned int seq_len,
void precompute_freqs(unsigned int dim, unsigned int seq_len,
std::vector<std::vector<float>> &freqs_cos,
std::vector<std::vector<float>> &freqs_sin,
std::vector<float> &freqs, float theta = 10000.0) {
Expand All @@ -33,24 +33,24 @@ void precompute_freqs(int dim, unsigned int seq_len,
freqs.push_back(1.0 / (std::pow(theta, (2 * i) / static_cast<float>(dim))));
}

auto cos = std::vector<std::vector<float>>();
cos.assign(seq_len, std::vector<float>(dim, 0));
auto cos_vec = std::vector<std::vector<float>>();
cos_vec.assign(seq_len, std::vector<float>(dim, 0));

auto sin = std::vector<std::vector<float>>();
sin.assign(seq_len, std::vector<float>(dim, 0));
auto sin_vec = std::vector<std::vector<float>>();
sin_vec.assign(seq_len, std::vector<float>(dim, 0));

for (unsigned int i = 0; i < seq_len; ++i) {
for (unsigned int j = 0; j < half_; ++j) {
float angle = i * freqs[j];
cos[i][j] = std::cos(angle);
cos[i][j + half_] = std::cos(angle); // repeated 2 times
cos_vec[i][j] = std::cos(angle);
cos_vec[i][j + half_] = std::cos(angle); // repeated 2 times

sin[i][j] = std::sin(angle);
sin[i][j + half_] = std::sin(angle); // repeated 2 times
sin_vec[i][j] = std::sin(angle);
sin_vec[i][j + half_] = std::sin(angle); // repeated 2 times
}
}
freqs_cos = cos;
freqs_sin = sin;
freqs_cos = cos_vec;
freqs_sin = sin_vec;
}

/**
Expand All @@ -59,12 +59,15 @@ void precompute_freqs(int dim, unsigned int seq_len,
* @param[in] dim hidden dim size
* @param[in] from sequence order
* @param[in] max_timestep maximum timestep
* @param[in] context layer context to get the resource manager and queue id
*
* @todo Calling precompute_freqs in finalize to reduce code redundancy.
*/
void apply_rotary_emb_cl(Tensor &in, unsigned int dim, unsigned int from,
unsigned int max_timestep, RunLayerContext &context) {
nntrainer::Tensor out(in.getDim());
float value = 0;
float transformed_value = 0.0;
float value = 0.0f;
float transformed_value = 0.0f;
unsigned int half_ = dim / 2;

std::vector<std::vector<float>> freqs_cos = {};
Expand Down
49 changes: 24 additions & 25 deletions nntrainer/tensor/cl_operations/attention_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,37 +30,36 @@ __kernel void rotary_emb_cl(__global float *input,
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
unsigned int gid = get_global_id(0);
unsigned int gws = get_global_size(0);
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;
float value = 0.0f;
float transformed_value = 0.0f;
for (unsigned int b = 0; b < batch; b++) {
for (unsigned int c = 0; c < channel; c++) {
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(unsigned int i = idx; i < idx + dim; i++){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
unsigned int b = get_global_id(0);
unsigned int c = get_global_id(1);
if(b < batch && c < channel){
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(unsigned int i = idx; i < idx + dim; i++){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = value;
}
for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = value;
}
}
}
Expand Down Expand Up @@ -252,8 +251,8 @@ void rotary_emb_cl(float *in, float *out,
break;
}

const int work_groups_count[3] = {1, 1, 1};
const int work_group_size[3] = {32, 1, 1}; // test-value
const int work_groups_count[3] = {(int)batch, (int)channel, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
kernel_rotary_emb, work_groups_count, work_group_size);
if (!result) {
Expand Down
48 changes: 23 additions & 25 deletions nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,38 +30,36 @@ __kernel void rotary_emb_cl_fp16(__global half *input,
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
unsigned int gid = get_global_id(0);
unsigned int gws = get_global_size(0);
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;
float value = 0.0f;
float transformed_value = 0.0f;
for (unsigned int b = 0; b < batch; b++) {
for (unsigned int c = 0; c < channel; c++) {
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(int i = idx; i < idx + dim; i++ ){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
unsigned int b = get_global_id(0);
unsigned int c = get_global_id(1);
if(b < batch && c < channel){
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(int i = idx; i < idx + dim; i++ ){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
}
for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = (float)input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * (float)input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = (half)value;
for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = (float)input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * (float)input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = (half)value;
}
}
}
Expand Down Expand Up @@ -259,8 +257,8 @@ void rotary_emb_cl(__fp16 *in, __fp16 *out,
break;
}

const int work_groups_count[3] = {1, 1, 1};
const int work_group_size[3] = {32, 1, 1}; // test-value
const int work_groups_count[3] = {(int)batch, (int)channel, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
kernel_rotary_emb_fp16, work_groups_count, work_group_size);
if (!result) {
Expand Down
31 changes: 16 additions & 15 deletions nntrainer/tensor/cl_operations/testing_rotarty_emb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
#include <string>

/**
* @brief compute frequency for rotary embedding
* @brief Testing code for CPU results and compute frequency for rotary
* embedding
* @param[in] dim hidden dim size
* @param[in] seq_len sequency length
* @param[out] freqs_cos cosine of the frequencies
Expand All @@ -24,7 +25,7 @@
* sin values for each position in sequence
* @param[in] theta rotary angle
*/
void precompute_freqs(int dim, unsigned int seq_len,
void precompute_freqs(unsigned int dim, unsigned int seq_len,
std::vector<std::vector<float>> &freqs_cos,
std::vector<std::vector<float>> &freqs_sin,
std::vector<float> &freqs, float theta = 10000.0) {
Expand All @@ -35,29 +36,29 @@ void precompute_freqs(int dim, unsigned int seq_len,
(std::pow(theta, (2 * i) / static_cast<float>(dim))));
}

auto cos = std::vector<std::vector<float>>();
cos.assign(seq_len, std::vector<float>(dim, 0));
auto cos_vec = std::vector<std::vector<float>>();
cos_vec.assign(seq_len, std::vector<float>(dim, 0));

auto sin = std::vector<std::vector<float>>();
sin.assign(seq_len, std::vector<float>(dim, 0));
auto sin_vec = std::vector<std::vector<float>>();
sin_vec.assign(seq_len, std::vector<float>(dim, 0));

for (unsigned int i = 0; i < seq_len; ++i) {
for (unsigned int j = 0; j < half_; ++j) {
float angle = i * freqs[j];
cos[i][j] = std::cos(angle);
cos[i][j + half_] = std::cos(angle); // repeated 2 times
cos_vec[i][j] = std::cos(angle);
cos_vec[i][j + half_] = std::cos(angle); // repeated 2 times

sin[i][j] = std::sin(angle);
sin[i][j + half_] = std::sin(angle); // repeated 2 times
sin_vec[i][j] = std::sin(angle);
sin_vec[i][j + half_] = std::sin(angle); // repeated 2 times
}
}
freqs_cos = cos;
freqs_sin = sin;
freqs_cos = cos_vec;
freqs_sin = sin_vec;
}
}

/**
* @brief apply rotary embedding
* @brief Testing code for CPU results and apply rotary embedding
* @param[in] in input tensor
* @param[in] dim hidden dim size
* @param[in] from sequence order
Expand All @@ -66,8 +67,8 @@ void precompute_freqs(int dim, unsigned int seq_len,
void apply_rotary_emb_tensor(nntrainer::Tensor &in, unsigned int dim,
unsigned int from, unsigned int max_timestep) {
nntrainer::Tensor out(in.getDim());
float value = 0;
float transformed_value = 0.0;
float value = 0.0f;
float transformed_value = 0.0f;
unsigned int half_ = dim / 2;

std::vector<std::vector<float>> freqs_cos = {};
Expand Down

0 comments on commit 1b0be27

Please sign in to comment.