[GPU/OpenCL] Initial version of Rotary Embedding Kernel for GPU and generalization via Attention Interface @ open sesame 10/18 11:30 #2721

Merged
16 changes: 16 additions & 0 deletions nntrainer/cl_context.cpp
@@ -15,6 +15,7 @@
*/

#include <addition_layer_cl.h>
#include <attention_kernel_strings.h>
#include <blas_kernel_strings.h>
#include <cl_context.h>
#include <concat_cl.h>
@@ -149,6 +150,21 @@ void ClContext::initBlasClKernels() {
blas_kernels_initialized = true;
}

void ClContext::initAttentionClKernels() {
if (attention_kernels_initialized) {
ml_logi("ClContext: Default attention kernels already registered and "
"initialized");
return;
}

registerClKernel(rotary_emb_cl_kernel_, "rotary_emb_cl");

#ifdef ENABLE_FP16
registerClKernel(rotary_emb_cl_kernel_fp16_, "rotary_emb_cl_fp16");
#endif
attention_kernels_initialized = true;
}

const ClContext::SharedPtrClKernel
ClContext::registerClKernel(std::string kernel_string,
std::string kernel_name) {
8 changes: 8 additions & 0 deletions nntrainer/cl_context.h
@@ -211,6 +211,11 @@ class ClContext {
*/
void initBlasClKernels();

/**
* @brief Initialize and register all attention OpenCl kernels
*/
void initAttentionClKernels();

/**
* @brief destructor to release opencl commandQueue
*/
@@ -229,6 +234,9 @@
// flag to check default blas kernels registered or not
bool blas_kernels_initialized = false;

// flag to check default attention kernels registered or not
bool attention_kernels_initialized = false;

FactoryMap<nntrainer::Layer> factory_map;

template <typename Args, typename T> struct isSupportedHelper;
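For orientation, here is a minimal usage sketch of the new registration path. The ClContext::Global() accessor and the lookup-by-reregistration pattern are assumptions for illustration; only initAttentionClKernels() and registerClKernel() are shown or introduced in this diff.

// Sketch only (not part of this PR): initialize the attention kernels once,
// then obtain the compiled rotary embedding kernel for use by a GPU layer.
auto &cl_context = nntrainer::ClContext::Global(); // assumed global accessor
cl_context.initAttentionClKernels();               // registers rotary_emb_cl (+ fp16 variant)

// Assumption: registerClKernel() returns the cached kernel when the same
// name was registered before, so it can double as a lookup here.
auto rotary_kernel = cl_context.registerClKernel(
  nntrainer::rotary_emb_cl_kernel_, "rotary_emb_cl");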
138 changes: 138 additions & 0 deletions nntrainer/tensor/cl_operations/attention_kernel_interface.cpp
@@ -0,0 +1,138 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Yash Singh <[email protected]>
*
* @file attention_kernel_interface.cpp
* @date 28 August 2024
* @brief Interface for attention OpenCL kernels
* @see https://github.com/nnstreamer/nntrainer
* @author Yash Singh <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include <attention_kernel_interface.h>
#include <attention_kernels.h>

namespace nntrainer {
/**
* @brief compute frequency for rotary embedding
* @param[in] dim hidden dim size
* @param[in] seq_len sequence length
* @param[out] freqs_cos cosine of the frequencies
* @param[out] freqs_sin sine of the frequencies
* @param[out] freqs base frequencies array to be used in the future computation
* @param[in] theta rotary angle
*/
void precompute_freqs(unsigned int dim, unsigned int seq_len,
std::vector<std::vector<float>> &freqs_cos,
std::vector<std::vector<float>> &freqs_sin,
std::vector<float> &freqs, float theta = 10000.0) {
unsigned int half_ = dim / 2;
for (unsigned int i = 0; i < half_; ++i) {
freqs.push_back(1.0 / (std::pow(theta, (2 * i) / static_cast<float>(dim))));
}

auto cos_vec = std::vector<std::vector<float>>();
cos_vec.assign(seq_len, std::vector<float>(dim, 0));

auto sin_vec = std::vector<std::vector<float>>();
sin_vec.assign(seq_len, std::vector<float>(dim, 0));

for (unsigned int i = 0; i < seq_len; ++i) {
for (unsigned int j = 0; j < half_; ++j) {
float angle = i * freqs[j];
cos_vec[i][j] = std::cos(angle);
cos_vec[i][j + half_] = std::cos(angle); // repeated 2 times

sin_vec[i][j] = std::sin(angle);
sin_vec[i][j + half_] = std::sin(angle); // repeated 2 times
}
}
freqs_cos = cos_vec;
freqs_sin = sin_vec;
}

/**
* @brief apply rotary embedding
* @param[in] in input tensor
* @param[in] dim hidden dim size
* @param[in] from sequence order
* @param[in] max_timestep maximum timestep
*
* @todo Calling precompute_freqs in finalize to reduce code redundancy.
*/
Contributor:
missing doc?

Suggested change:
- */
+ * @param[in] context layer context to get the resource manager and queue id
+ */

Contributor Author:
Sure, I'll update this as well.

void apply_rotary_emb_cl(Tensor &in, unsigned int dim, unsigned int from,
unsigned int max_timestep) {
nntrainer::Tensor out(in.getDim());
float value = 0.0f;
float transformed_value = 0.0f;
unsigned int half_ = dim / 2;

std::vector<std::vector<float>> freqs_cos = {};
Contributor (@EunjuYang, Sep 4, 2024):
It appears that the function apply_rotary_emb_cl generates freqs_sin and freqs_cos every time it runs, which seems unintentional. As far as I understand, the value of dim is typically constant across transformer blocks (please correct me if I misunderstood), so the purpose of this code was to generate a set of frequencies that can be reused. In this particular implementation, however, it does not work that way and ends up performing repetitive calculations. What are your thoughts on this issue?

Contributor (@EunjuYang, Sep 4, 2024):
One possible way to update this is to take freqs_cos and freqs_sin as inputs and call the precompute only when they are null or empty.

Contributor Author:
Initially I also kept freqs_cos, freqs_sin, and freqs as global vectors and only updated them inside precompute_freqs if (freqs_cos.empty()), but after looking at custom_multi_head_attention_layer.cpp in the felice repo, dim is passed as

  apply_rotary_emb_tensor(projected_query_step, projected_query_dim_prop, _from);
  apply_rotary_emb_tensor(cache_key_step, projected_key_dim_prop, _from);

That's why, as per my understanding, I computed both vectors repeatedly based on dim.

Are you suggesting that we pass freqs_cos and freqs_sin as parameters to apply_rotary_emb_tensor_cl, or somewhere else? If we pass both vectors as parameters, then before calling precompute_freqs we can check whether they are empty, reducing the redundancy of the code.

Contributor:
You're correct that the dimension for the query/key might differ, but the block will be repeated (stacked several times!). Also, apply_rotary_emb_tensor_cl will be invoked whenever forwarding of the multi_head_attention layer is called, so it is worth considering reducing the unnecessary computation.

Contributor Author:
How about making them parameters of the apply_rotary_emb_tensor_cl function? I think that might be a good idea.

Apart from this, another solution is to call precompute_freqs in the finalize of multi_head_attention instead of inside apply_rotary_emb_tensor_cl: in finalize we can check if (freqs_cos.empty()) precompute_freqs();, and make freqs_cos and freqs_sin global vectors in attention_kernel_interface.cpp.

Contributor:
It sounds reasonable to call precompute_freqs() when finalize() is invoked! 👍

Contributor Author:
I've added a @todo in the latest commit for the above.

std::vector<std::vector<float>> freqs_sin = {};
std::vector<float> freqs;

precompute_freqs(dim, max_timestep, freqs_cos, freqs_sin, freqs);

std::vector<float> cos_;
std::vector<float> sin_;

if (from >= max_timestep) {
cos_.resize(dim);
sin_.resize(dim);

for (unsigned int i = 0; i < half_; ++i) {
float angle = from * freqs[i];
cos_[i] = std::cos(angle);
cos_[i + half_] = std::cos(angle); // repeated 2 times

sin_[i] = std::sin(angle);
sin_[i + half_] = std::sin(angle); // repeated 2 times
}
} else {
cos_.resize(max_timestep);
sin_.resize(max_timestep);
}

unsigned int input_batch_size, input_height, input_width, input_channels;
input_batch_size = in.batch();
input_height = in.height();
input_width = in.width();
input_channels = in.channel();

if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {

unsigned int in_size = in.size();
unsigned int out_size = out.size();
float *data = in.getData();
float *rdata = out.getData();

rotary_emb_cl(data, rdata, freqs_cos, freqs_sin, cos_, sin_,
input_batch_size, input_channels, input_height, input_width,
dim, from, max_timestep, in_size, out_size);

} else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16

unsigned int in_size = in.size();
unsigned int out_size = out.size();
_FP16 *data = in.getData<_FP16>();
_FP16 *rdata = out.getData<_FP16>();

rotary_emb_cl(data, rdata, freqs_cos, freqs_sin, cos_, sin_,
input_batch_size, input_channels, input_height, input_width,
dim, from, max_timestep, in_size, out_size);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}

if (from >= max_timestep) {
cos_.clear();
sin_.clear();
}

in.copy(out);
}
} // namespace nntrainer
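The review thread above settles on precomputing the frequency tables only once (e.g. from the layer's finalize()). A minimal sketch of that idea follows, assuming the cache lives in this translation unit; the cached_* names and ensure_freqs helper are illustrative and not part of this commit, which only records the intent as a @todo.

// Illustrative only: file-scope caches filled on first use, so
// apply_rotary_emb_cl() no longer recomputes the tables on every forward call.
static std::vector<std::vector<float>> cached_freqs_cos;
static std::vector<std::vector<float>> cached_freqs_sin;
static std::vector<float> cached_freqs;

static void ensure_freqs(unsigned int dim, unsigned int max_timestep) {
  if (cached_freqs_cos.empty()) {
    precompute_freqs(dim, max_timestep, cached_freqs_cos, cached_freqs_sin,
                     cached_freqs);
  }
}

A real implementation would likely key the cache by dim, since the query and key dimensions can differ, as noted in the discussion.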
33 changes: 33 additions & 0 deletions nntrainer/tensor/cl_operations/attention_kernel_interface.h
@@ -0,0 +1,33 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Yash Singh <[email protected]>
*
* @file attention_kernel_interface.h
* @date 28 August 2024
* @brief Interface for attention OpenCL kernels
* @see https://github.com/nnstreamer/nntrainer
* @author Yash Singh <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __ATTENTION_KERNEL_INTERFACE_H__
#define __ATTENTION_KERNEL_INTERFACE_H__

#include <string>
#include <tensor.h>

namespace nntrainer {

/**
* @brief Rotary Embedding kernel
* @param[in] in input tensor
* @param[in] dim hidden dim size
* @param[in] from sequence order
* @param[in] max_timestep maximum timestep
*/
void apply_rotary_emb_cl(Tensor &in, unsigned int dim, unsigned int from,
unsigned int max_timestep);

} // namespace nntrainer
#endif /* __ATTENTION_KERNEL_INTERFACE_H__ */
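For completeness, a hedged call-site sketch of the new entry point, mirroring how the CPU-side apply_rotary_emb_tensor() is used per the discussion above; the tensor shape, constructor form, and constants are illustrative only.

// Hypothetical call site: apply rotary embedding to one projected query step.
unsigned int dim = 64;            // head dimension (assumed)
unsigned int from = 0;            // current sequence position
unsigned int max_timestep = 1024; // maximum cached timestep (assumed)

nntrainer::Tensor projected_query_step(1, 1, 1, dim); // shape assumed for illustration
nntrainer::apply_rotary_emb_cl(projected_query_step, dim, from, max_timestep);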
133 changes: 133 additions & 0 deletions nntrainer/tensor/cl_operations/attention_kernel_strings.h
@@ -0,0 +1,133 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Yash Singh <[email protected]>
*
* @file attention_kernel_strings.h
* @date 8 October 2024
* @brief All attention OpenCL kernel strings
* @see https://github.com/nnstreamer/nntrainer
* @author Yash Singh <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __ATTENTION_KERNEL_STRINGS_H__
#define __ATTENTION_KERNEL_STRINGS_H__

#include <string>

namespace nntrainer {
static const std::string rotary_emb_cl_kernel_ = R"(

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void rotary_emb_cl(__global float *input,
__global float *output,
__global float *freqs_cos,
__global float *freqs_sin,
__global float *cos_,
__global float *sin_,
unsigned int batch,
unsigned int channel,
unsigned int height,
unsigned int width,
unsigned int dim,
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;

float value = 0.0f;
float transformed_value = 0.0f;

unsigned int b = get_global_id(0);
unsigned int c = get_global_id(1);

if(b < batch && c < channel){
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(unsigned int i = idx; i < idx + dim; i++){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
}

for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = value;
}
}
}
}
}
)";

#ifdef ENABLE_FP16
static const std::string rotary_emb_cl_kernel_fp16_ = R"(

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void rotary_emb_cl_fp16(__global half *input,
__global half *output,
__global float *freqs_cos,
__global float *freqs_sin,
__global float *cos_,
__global float *sin_,
unsigned int batch,
unsigned int channel,
unsigned int height,
unsigned int width,
unsigned int dim,
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;

float value = 0.0f;
float transformed_value = 0.0f;

unsigned int b = get_global_id(0);
unsigned int c = get_global_id(1);

if(b < batch && c < channel){
for (unsigned int h = 0; h < height; h++) {
if (from + h < max_timestep) {
unsigned idx = (from + h)*dim;
for(int i = idx; i < idx + dim; i++ ){
cos_ptr[i - idx] = freqs_cos[i];
sin_ptr[i - idx] = freqs_sin[i];
}
}

for (unsigned int w = 0; w < width; w = w + dim) {
for (unsigned int k = 0; k < dim; k++) {
unsigned int span = w + k;
value = (float)input[b * channel * height * width + c * height * width + h * width + span];
if (k < half_) {
transformed_value = -1.0f * (float)input[b * channel * height * width + c * height * width + h * width + span + half_];
} else {
transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
output[b * channel * height * width + c * height * width + h * width + span] = (half)value;
}
}
}
}
}
)";

#endif
} // namespace nntrainer
#endif /* __ATTENTION_KERNEL_STRINGS_H__ */
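To show how the kernel's work decomposition is meant to be driven, here is a hedged host-side sketch using the raw OpenCL C API rather than nntrainer's own wrapper classes; buffer creation, kernel compilation, and error checks are omitted, and every handle and size below is an assumption for illustration.

#include <CL/cl.h>

// Hedged sketch: launch rotary_emb_cl with one work-item per (batch, channel)
// pair, matching the get_global_id(0)/get_global_id(1) usage in the kernel.
// All handles are assumed to have been created elsewhere.
void launch_rotary_emb(cl_command_queue queue, cl_kernel kernel,
                       cl_mem in_buf, cl_mem out_buf, cl_mem freqs_cos_buf,
                       cl_mem freqs_sin_buf, cl_mem cos_buf, cl_mem sin_buf) {
  // Illustrative shapes; a real caller passes the tensor's actual dimensions.
  cl_uint batch = 1, channel = 1, height = 4, width = 64, dim = 64,
          half_ = dim / 2, max_timestep = 1024, from = 0;

  cl_mem bufs[] = {in_buf, out_buf, freqs_cos_buf, freqs_sin_buf,
                   cos_buf, sin_buf};
  for (cl_uint i = 0; i < 6; ++i)
    clSetKernelArg(kernel, i, sizeof(cl_mem), &bufs[i]);

  cl_uint scalars[] = {batch, channel, height, width,
                       dim, half_, max_timestep, from};
  for (cl_uint i = 0; i < 8; ++i)
    clSetKernelArg(kernel, 6 + i, sizeof(cl_uint), &scalars[i]);

  size_t global[2] = {batch, channel}; // 2-D NDRange over batch x channel
  clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, global, nullptr, 0,
                         nullptr, nullptr);
}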