diff --git a/Applications/AlexNet/jni/Android.mk b/Applications/AlexNet/jni/Android.mk index d2cbf8dce2..b3a5c8672e 100644 --- a/Applications/AlexNet/jni/Android.mk +++ b/Applications/AlexNet/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Android/NNDetector/app/src/main/jni/Android.mk b/Applications/Android/NNDetector/app/src/main/jni/Android.mk index 00fdb1ffc1..757a97ffbf 100644 --- a/Applications/Android/NNDetector/app/src/main/jni/Android.mk +++ b/Applications/Android/NNDetector/app/src/main/jni/Android.mk @@ -28,17 +28,15 @@ include $(CLEAR_VARS) NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/include/nntrainer SIMPLESHOT_DIR = . - LOCAL_ARM_NEON := true -LOCAL_CFLAGS += -std=c++17 -Ofast -mcpu=cortex-a53 -Ilz4-nougat/lib -LOCAL_LDFLAGS += -Llz4-nougat/lib/obj/local/$(TARGET_ARCH_ABI)/ -LOCAL_CXXFLAGS += -std=c++17 -frtti -fexceptions +LOCAL_CFLAGS += -std=c++17 -Ofast -mcpu=cortex-a53 -Ilz4-nougat/lib -DARM=1 +LOCAL_LDFLAGS += -Llz4-nougat/lib/obj/local/$(TARGET_ARCH_ABI)/ -DARM=1 +LOCAL_CXXFLAGS += -std=c++17 -frtti -fexceptions -fopenmp -static-openmp -DARM=1 LOCAL_CFLAGS += -pthread -fexceptions -fopenmp -static-openmp -LOCAL_LDFLAGS += -fexceptions -fopenmp -static-openmp LOCAL_MODULE_TAGS := optional LOCAL_ARM_MODE := arm LOCAL_MODULE := simpleshot_jni -LOCAL_LDLIBS := -llog -landroid -fopenmp -static-openmp -ljnigraphics +LOCAL_LDLIBS := -llog -landroid -fopenmp -static-openmp -ljnigraphics -DARM=1 LOCAL_SRC_FILES := simpleshot.cpp simpleshot_jni.cpp dataloader.cpp image.cpp LOCAL_SHARED_LIBRARIES := ccapi-nntrainer nntrainer diff --git a/Applications/Custom/LayerClient/jni/Android.mk b/Applications/Custom/LayerClient/jni/Android.mk index 56dd4286eb..af9021d7d6 100644 --- a/Applications/Custom/LayerClient/jni/Android.mk +++ b/Applications/Custom/LayerClient/jni/Android.mk @@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ ${ML_API_COMMON_INCLUDES} diff --git a/Applications/LLaMA/jni/Android.mk b/Applications/LLaMA/jni/Android.mk index f1a9c2f117..ddfab54ca4 100644 --- a/Applications/LLaMA/jni/Android.mk +++ b/Applications/LLaMA/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Layers/jni/Android.mk b/Applications/Layers/jni/Android.mk index bef36d8739..fea882c0ac 100644 --- a/Applications/Layers/jni/Android.mk +++ b/Applications/Layers/jni/Android.mk @@ -20,6 +20,9 
@@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Multi_input/jni/Android.mk b/Applications/Multi_input/jni/Android.mk index 93445867b4..207eadd0bc 100644 --- a/Applications/Multi_input/jni/Android.mk +++ b/Applications/Multi_input/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/PicoGPT/jni/Android.mk b/Applications/PicoGPT/jni/Android.mk index f7cc55d436..ce30e58868 100644 --- a/Applications/PicoGPT/jni/Android.mk +++ b/Applications/PicoGPT/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/ProductRatings/jni/Android.mk b/Applications/ProductRatings/jni/Android.mk index 7a475d643c..96bc87d134 100644 --- a/Applications/ProductRatings/jni/Android.mk +++ b/Applications/ProductRatings/jni/Android.mk @@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer/include \ $(NNTRAINER_ROOT)/nntrainer/layers \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/graph \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor diff --git a/Applications/ReinforcementLearning/DeepQ/jni/Android.mk b/Applications/ReinforcementLearning/DeepQ/jni/Android.mk index 67173ad256..9d3cf4ddb5 100644 --- a/Applications/ReinforcementLearning/DeepQ/jni/Android.mk +++ b/Applications/ReinforcementLearning/DeepQ/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Resnet/jni/Android.mk b/Applications/Resnet/jni/Android.mk index 1c807ec393..460fb8e5eb 100644 --- a/Applications/Resnet/jni/Android.mk +++ b/Applications/Resnet/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ 
$(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk b/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk index 22be25c1dc..271f5997b6 100644 --- a/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk +++ b/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/TransferLearning/Draw_Classification/jni/Android.mk b/Applications/TransferLearning/Draw_Classification/jni/Android.mk index 9e933db23a..0f4ed02cf2 100644 --- a/Applications/TransferLearning/Draw_Classification/jni/Android.mk +++ b/Applications/TransferLearning/Draw_Classification/jni/Android.mk @@ -18,6 +18,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/models \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ diff --git a/Applications/VGG/jni/Android.mk b/Applications/VGG/jni/Android.mk index 76aa559a51..73de265251 100644 --- a/Applications/VGG/jni/Android.mk +++ b/Applications/VGG/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/YOLOv2/jni/Android.mk b/Applications/YOLOv2/jni/Android.mk index 9f0dfb7165..ad5d90e696 100644 --- a/Applications/YOLOv2/jni/Android.mk +++ b/Applications/YOLOv2/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/YOLOv3/jni/Android.mk b/Applications/YOLOv3/jni/Android.mk index 115218f45c..0877c17cb3 100644 --- a/Applications/YOLOv3/jni/Android.mk +++ b/Applications/YOLOv3/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + 
$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install index 656755aa78..82e5260251 100644 --- a/debian/nntrainer-dev.install +++ b/debian/nntrainer-dev.install @@ -14,7 +14,10 @@ /usr/include/nntrainer/short_tensor.h /usr/include/nntrainer/float_tensor.h /usr/include/nntrainer/tensor_wrap_specs.h -/usr/include/nntrainer/blas_interface.h +/usr/include/nntrainer/fallback_internal.h +/usr/include/nntrainer/cblas_interface.h +/usr/include/nntrainer/x86_compute_backend.h +/usr/include/nntrainer/cpu_backend.h /usr/include/nntrainer/var_grad.h /usr/include/nntrainer/weight.h # todo: update dataset headers diff --git a/jni/meson.build b/jni/meson.build index e552919beb..82dff1db8f 100644 --- a/jni/meson.build +++ b/jni/meson.build @@ -25,6 +25,12 @@ and_conf.set('VERSION_MAJOR', nntrainer_version_split[0]) and_conf.set('VERSION_MINOR', nntrainer_version_split[1]) and_conf.set('VERSION_MICRO', nntrainer_version_split[2]) +arch = host_machine.cpu_family() +and_conf.set('ARM', 1) +if arch == 'arm' + and_conf.set('ARMV7', 1) +endif + if get_option('enable-capi').enabled() and_conf.set('MESON_CAPI_NNTRAINER_SRCS', ' '.join(capi_src)) and_conf.set('MESON_CAPI_NNTRAINER_INCS', ' '.join(capi_inc_abs)) diff --git a/meson.build b/meson.build index 98e2cae9f6..3123357533 100644 --- a/meson.build +++ b/meson.build @@ -68,9 +68,23 @@ warning_c_flags = [ '-Wno-error=varargs' ] +arch = host_machine.cpu_family() +if arch == 'arm' or arch == 'aarch64' or get_option('platform') == 'android' + message('Build for ARM architecture') + extra_defines += '-DARM=1' + if arch == 'arm' + extra_defines += '-DARMV7=1' + endif +elif arch == 'x86' or arch == 'x86_64' + message('Build for X86 architecture') + if get_option('enable-fp16') + add_project_arguments(['-march=native'], language: ['c','cpp']) + message('-march=native added for AVX hardware acceleration.') + endif + extra_defines += '-DX86=1' +endif if get_option('enable-fp16') - arch = host_machine.cpu_family() if get_option('platform') == 'android' add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp']) extra_defines += '-DENABLE_FP16=1' @@ -110,11 +124,6 @@ if get_option('enable-fp16') if cc.version().version_compare('>=12.1.0') message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.') extra_defines += '-DENABLE_FP16=1' - if get_option('enable-avx') - extra_defines += '-DUSE_AVX=1' - add_project_arguments(['-march=native'], language: ['c','cpp']) - message('-march=native added for AVX hardware acceleration.') - endif else warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support.
This build will probably fail unless you bring a compiler that supports fp16 for x64.') endif diff --git a/nntrainer/layers/acti_func.h b/nntrainer/layers/acti_func.h index c6c3576414..930f46a797 100644 --- a/nntrainer/layers/acti_func.h +++ b/nntrainer/layers/acti_func.h @@ -16,8 +16,8 @@ #define __ACTI_FUNC_H__ #ifdef __cplusplus -#include #include +#include namespace nntrainer { diff --git a/nntrainer/layers/activation_layer.cpp b/nntrainer/layers/activation_layer.cpp index 8fd59506cc..ec42bfbe57 100644 --- a/nntrainer/layers/activation_layer.cpp +++ b/nntrainer/layers/activation_layer.cpp @@ -20,8 +20,8 @@ #include #include -#include #include +#include #include #include #include @@ -32,8 +32,7 @@ namespace nntrainer { ActivationLayer::ActivationLayer() : - Layer(), - activation_props(new PropTypes(props::Activation())) { + Layer(), activation_props(new PropTypes(props::Activation())) { acti_func.setActiFunc(ActivationType::ACT_NONE); } diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp index c059ae9caf..59a218e997 100644 --- a/nntrainer/layers/conv2d_layer.cpp +++ b/nntrainer/layers/conv2d_layer.cpp @@ -16,8 +16,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp deleted file mode 100644 index 91187c50cc..0000000000 --- a/nntrainer/tensor/blas_interface.cpp +++ /dev/null @@ -1,1130 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (C) 2020 Jijoong Moon - * - * @file blas_interface.cpp - * @date 28 Aug 2020 - * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon - * @author Sungsik Kong - * @bug No known bugs except for NYI items - * @brief This is dummy header for blas support - * - */ - -#include -#include - -#if (defined USE__FP16 && defined USE_NEON) -#include -#include -#endif - -#if USE_AVX -#include -#endif - -#ifdef USE_BLAS -extern "C" { -#include -} -#endif - -#include - -#define sgemv_loop(ci, cj, cM, cN) \ - do { \ - float y0; \ - unsigned int i, j; \ - for (ci = 0; ci != cM; ci++) { \ - y0 = Y[ci * incy] * beta; \ - for (cj = 0; cj != cN; cj++) \ - y0 += A[i + j * lda] * X[cj * incx]; \ - Y[ci * incy] = y0; \ - } \ - } while (0); - -#define hgemv_loop(ci, cj, cM, cN) \ - do { \ - float y0; \ - unsigned int i, j; \ - for (ci = 0; ci != cM; ci++) { \ - y0 = static_cast(Y[ci * incy] * static_cast<_FP16>(beta)); \ - for (cj = 0; cj != cN; cj++) \ - y0 += static_cast(A[i + j * lda] * X[cj * incx]); \ - Y[ci * incy] = static_cast<_FP16>(y0); \ - } \ - } while (0); - -#define haxpy_loop() \ - do { \ - unsigned int i; \ - for (i = 0; i < N; ++i) \ - Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX]; \ - } while (0); - -#define hgemm_loop() \ - do { \ - for (unsigned int m = 0; m < M; ++m) { \ - for (unsigned int n = 0; n < N; ++n) { \ - float c = 0; \ - _FP16 c_old = C[m * ldc + n]; \ - for (unsigned int k = 0; k < K; ++k) { \ - _FP16 a, b; \ - a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); \ - b = ((TransB) ? 
B[n * ldb + k] : B[k * ldb + n]); \ - c += static_cast(a * b); \ - } \ - C[m * ldc + n] = static_cast<_FP16>(alpha * c); \ - if (beta != 0.0) \ - C[m * ldc + n] += static_cast<_FP16>(beta) * c_old; \ - } \ - } \ - } while (0); - -namespace nntrainer { - -template -static inline void transpose_fallback(unsigned int M, unsigned int N, - const T *src, unsigned int ld_src, T *dst, - unsigned int ld_dst) { - for (unsigned int i = 0; i < M; i++) { - for (unsigned int j = 0; j < N; j++) { - dst[i + j * ld_dst] = src[i * ld_src + j]; - } - } -} - -#ifdef ENABLE_FP16 -static void saxpy_FP16(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY) { - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not supported"); - -#if (defined USE__FP16 && USE_NEON) - // USE__FP16 is defined when platform is android - if (incX == 1 && incY == 1) { - nntrainer::neon::haxpy(N, alpha, X, Y); - } else { - haxpy_loop(); - } -#else - haxpy_loop(); -#endif -} - -static void sgemv_FP16(const unsigned int TStorageOrder, bool TransA, - const unsigned int M, const unsigned int N, - const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY) { -#if (defined USE__FP16 && USE_NEON) - if (TransA) { - nntrainer::neon::hgemv_transpose(A, X, Y, M, N, alpha, beta); - } else { - nntrainer::neon::hgemv(A, X, Y, M, N, alpha, beta); - } -#else - unsigned int lenX = - (TransA) ? 1 + (M - 1) * abs(incX) : 1 + (N - 1) * abs(incX); - unsigned int lenY = - (TransA) ? 1 + (N - 1) * abs(incY) : 1 + (M - 1) * abs(incY); - - float *A_ = new float[M * N]; - float *X_ = new float[lenX]; - float *Y_ = new float[lenY]; - - scopy(M * N, A, 1, A_, 1); - scopy(lenX, X, 1, X_, 1); - scopy(lenY, Y, 1, Y_, 1); - - sgemv(TStorageOrder, TransA, M, N, alpha, A_, lda, X_, incX, beta, Y_, incY); - - scopy(lenY, Y_, 1, Y, 1); - - delete[] A_; - delete[] X_; - delete[] Y_; -#endif -} - -static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X, - const unsigned int incX, const _FP16 *Y, - const unsigned int incY) { - - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not supported"); - - _FP16 ret = 0; - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - ret = nntrainer::neon::hdot(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } - } -#else - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } -#endif - return ret; -} - -static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX, - _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::hcopy(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; -#endif -} - -static void copy_float32_to_float16(const unsigned int N, const float *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_fp32_to_fp16(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#elif USE_AVX - if (incX == 1 && incY == 1) { - nntrainer::avx::vcvt_f32_f16(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i 
* incy] = static_cast<_FP16>(X[i * incx]); - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast<_FP16>(X[i * incx]); -#endif -} - -static void copy_float16_to_float32(const unsigned int N, const _FP16 *X, - const int incX, float *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_fp16_to_fp32(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#elif USE_AVX - if (incX == 1 && incY == 1) { - nntrainer::avx::vcvt_f16_f32(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast(X[i * incx]); - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast(X[i * incx]); -#endif -} - -static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_int4_to_fp16(N, X, Y); - } else { - throw std::invalid_argument( - "Error: incX == 1 && incY == 1 is supported only"); - } -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -#endif -} - -static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_int8_to_fp16(N, X, Y); - } else { - throw std::invalid_argument( - "Error: incX == 1 && incY == 1 is supported only"); - } -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) { - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1) { - nntrainer::neon::hscal(N, X, alpha); - } else { - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx]; - } -#else - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx]; -#endif -} - -static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) { - unsigned int incx = abs(incX); - _FP16 sum; - _FP16 tmp; -#if (defined USE__FP16 && USE_NEON) - if (incX == 1) { - sum = nntrainer::neon::hnrm2(N, X); - } else { - float sum32 = 0; - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum32 += tmp * tmp; - } - sum = static_cast<_FP16>(sqrt(sum32)); - } -#else - float sum32 = 0; - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum32 += tmp * tmp; - } - sum = static_cast<_FP16>(sqrt(sum32)); -#endif - return sum; -} - -static void sgemm_FP16(const unsigned int TStorageOrder, bool TransA, - bool TransB, const unsigned int M, const unsigned int N, - const unsigned int K, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *B, - const unsigned int ldb, const float beta, _FP16 *C, - const unsigned int ldc) { - -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA, TransB); -#else - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - - float *A_ = new float[M * K]; - float *B_ = new float[N * K]; - float *C_ = new float[M * N]; - - scopy(M * K, A, 1, A_, 1); - scopy(N * K, B, 1, B_, 1); - scopy(M * N, C, 1, C_, 1); - cblas_sgemm(order, transA, transB, M, N, K, alpha, A_, lda, B_, ldb, beta, C_, - ldc); - scopy(M * N, C_, 1, C, 1); - - delete[] A_; - delete[] B_; - delete[] C_; -#endif -} - -static unsigned int isamax_FP16(const unsigned int N, const _FP16 *X, - const int incX) { - unsigned int max_idx = 0; - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && N >= 8) { - max_idx = nntrainer::neon::isamax(N, X); - } else { - _FP16 max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n]; - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } - } -#else - _FP16 max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n]; - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } -#endif - - return max_idx; -} - -void saxpy(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY) { - saxpy_FP16(N, alpha, X, incX, Y, incY); -} - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const _FP16 *A, const unsigned int lda, - const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, - const unsigned int ldc) { - sgemm_FP16(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, - const int incY) { - scopy_FP16(N, X, incX, Y, incY); -} - -void scopy(const unsigned int N, const float *X, const int incX, _FP16 *Y, - const int incY) { - copy_float32_to_float16(N, X, incX, Y, incY); -} - -void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, - const int incY) { - copy_float16_to_float32(N, X, incX, Y, incY); -} - -void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - copy_int4_to_fp16(N, X, incX, Y, incY); -} - -void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - copy_int8_to_fp16(N, X, incX, Y, incY); -} - -static void ele_mul_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X * static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_add_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X + static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_sub_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X - static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_div_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - 
unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X / (static_cast<_FP16>(alpha) * *Y) + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); -#else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); -#else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); -#else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); -#else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -_FP16 snrm2(const int N, const _FP16 *X, const int incX) { - return snrm2_FP16(N, X, incX); -} - -_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, - const _FP16 *Y, const unsigned int incY) { - return sdot_FP16(N, X, incX, Y, incY); -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY) { - sgemv_FP16(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -} - -unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) { - /// @todo isamax_FP16 for BLAS_NUM_THREADS - return isamax_FP16(N, X, incX); -} - -void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { -#ifdef USE_NEON - nntrainer::neon::inv_sqrt_inplace(N, X); -#else - for (unsigned int i = 0; i < N; ++i) { - X[i] = static_cast<_FP16>(1 / std::sqrt(static_cast(X[i]))); - } -#endif -} - -void transpose_matrix(const unsigned int M, const unsigned int N, - const _FP16 *src, unsigned int ld_src, _FP16 *dst, - unsigned int ld_dst) { -#ifdef USE_NEON - transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst); -#else - transpose_fallback<_FP16>(M, N, src, ld_src, dst, ld_dst); -#endif -} -#endif - -#ifndef USE_BLAS -static void saxpy_raw(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY) { - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not 
supported"); - for (unsigned int i = 0; i < N; ++i) - Y[i * incY] = Y[i * incY] + X[i * incX] * alpha; -} - -static void sgemv_raw(const unsigned int TStorageOrder, bool TransA, - const unsigned int M, const unsigned int N, - const float alpha, const float *A, const unsigned int lda, - const float *X, const int incX, const float beta, - float *Y, const int incY) { - - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - - if (TransA) { - sgemv_loop(i, j, N, M); - } else { - sgemv_loop(j, i, M, N); - } -} - -static float sdot_raw(const unsigned int N, const float *X, - const unsigned int incX, const float *Y, - const unsigned int incY) { - float ret = 0; - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } - return ret; -} - -static void scopy_raw(const unsigned int N, const float *X, const int incX, - float *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; -} - -static void sscal_raw(const unsigned int N, const float alpha, float *X, - const int incX) { - unsigned int incx = abs(incX); - - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = alpha * X[i * incx]; -} - -static float snrm2_raw(const unsigned int N, const float *X, const int incX) { - unsigned int incx = abs(incX); - float sum = 0.0f; - float tmp; - - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum += tmp * tmp; - } - return sqrt(sum); -} - -static void sgemm_raw(const unsigned int TStorageOrder, bool TransA, - bool TransB, const unsigned int M, const unsigned int N, - const unsigned int K, const float alpha, const float *A, - const unsigned int lda, const float *B, - const unsigned int ldb, const float beta, float *C, - const unsigned int ldc) { - - for (unsigned int m = 0; m < M; ++m) { - for (unsigned int n = 0; n < N; ++n) { - double c = 0.0; - float c_old = C[m * ldc + n]; - for (unsigned int k = 0; k < K; ++k) { - float a, b; - a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); - b = ((TransB) ? 
B[n * ldb + k] : B[k * ldb + n]); - c += a * b; - } - C[m * ldc + n] = alpha * c; - if (beta != 0.0) - C[m * ldc + n] += beta * c_old; - } - } -} - -static unsigned int isamax_raw(const unsigned int N, const float *X, - const int incX) { - - unsigned int max_idx = 0; - float max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - float cur_val = abs(X[n]); - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } - - return max_idx; -} - -#endif - -void sscal(const unsigned int N, const float alpha, void *X, const int incX, - ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { - -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif // BLAS_NUM_THREADS - cblas_sscal(N, alpha, (float *)X, incX); -#else // USE_BLAS else - sscal_raw(N, alpha, (float *)X, incX); -#endif // USE_BLAS - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - sscal(N, alpha, (_FP16 *)X, incX); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void sscal(const unsigned int N, const float alpha, float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_sscal(N, alpha, X, incX); -#else - sscal_raw(N, alpha, X, incX); -#endif -} - -void saxpy(const unsigned int N, const float alpha, const void *X, - const int incX, void *Y, const int incY, - ml::train::TensorDim::DataType d_type) { - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_saxpy(N, alpha, static_cast(X), incX, - static_cast(Y), incY); -#else - saxpy_raw(N, alpha, static_cast(X), incX, - static_cast(Y), incY); -#endif - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - saxpy_FP16(N, alpha, static_cast(X), incX, - static_cast<_FP16 *>(Y), incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void saxpy(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_saxpy(N, alpha, X, incX, Y, incY); -#else - saxpy_raw(N, alpha, X, incX, Y, incY); -#endif -} - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const void *A, const unsigned int lda, - const void *B, const unsigned int ldb, const float beta, void *C, - const unsigned int ldc, ml::train::TensorDim::DataType d_type) { - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_CUBLAS - int devID = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, devID); - float *d_A, *d_B, *d_C; - - unsigned int size_A = M * K * sizeof(float); - unsigned int size_B = K * N * sizeof(float); - unsigned int size_C = M * N * sizeof(float); - - cudaMalloc((void **)&d_A, size_A); - cudaMalloc((void **)&d_B, size_B); - cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); - cudaMalloc((void **)&d_C, size_C); - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasOperation_t transA = (TransA) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = (TransB) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta, - d_C, N); - - cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); - cublasDestroy(handle); - -#elif defined USE_BLAS - -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor; - cblas_sgemm( - order, transA, transB, M, N, K, alpha, static_cast(A), lda, - static_cast(B), ldb, beta, static_cast(C), ldc); -#else - sgemm_raw(TStorageOrder, TransA, TransB, M, N, K, alpha, - static_cast(A), lda, static_cast(B), - ldb, beta, static_cast(C), ldc); -#endif - - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - sgemm_FP16(TStorageOrder, TransA, TransB, M, N, K, alpha, - static_cast(A), lda, - static_cast(B), ldb, beta, - static_cast<_FP16 *>(C), ldc); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} // namespace nntrainer - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const float *A, const unsigned int lda, - const float *B, const unsigned int ldb, const float beta, float *C, - const unsigned int ldc) { -#ifdef USE_CUBLAS - int devID = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, devID); - float *d_A, *d_B, *d_C; - - unsigned int size_A = M * K * sizeof(float); - unsigned int size_B = K * N * sizeof(float); - unsigned int size_C = M * N * sizeof(float); - - cudaMalloc((void **)&d_A, size_A); - cudaMalloc((void **)&d_B, size_B); - cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); - cudaMalloc((void **)&d_C, size_C); - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasOperation_t transA = (TransA) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = (TransB) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta, - d_C, N); - - cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); - cublasDestroy(handle); -#elif defined USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - cblas_sgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, - ldc); -#else - sgemm_raw(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, - C, ldc); -#endif -} - -void scopy(const unsigned int N, const void *X, const int incX, void *Y, - const int incY, ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { - -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_scopy(N, (float *)X, incX, (float *)Y, incY); -#else - scopy_raw(N, (float *)X, incX, (float *)Y, incY); -#endif - - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - scopy_FP16(N, (_FP16 *)X, incX, (_FP16 *)Y, incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void scopy(const unsigned int N, const float *X, const int incX, float *Y, - const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_scopy(N, X, incX, Y, incY); -#else - scopy_raw(N, X, incX, Y, incY); -#endif -} - -void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, - const int intY) { -#ifdef USE_NEON - nntrainer::neon::copy_int8_or_int4(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int incY) { -#ifdef USE_NEON - nntrainer::neon::copy_int4_to_fp32(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -#endif -} - -void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int incY) { -#ifdef USE_NEON - nntrainer::neon::copy_int8_to_fp32(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -float snrm2(const int N, const float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_snrm2(N, X, incX); -#else - return snrm2_raw(N, X, incX); -#endif -} - -float sdot(const unsigned int N, const float *X, const unsigned int incX, - const float *Y, const unsigned int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_sdot(N, X, incX, Y, incY); -#else - return sdot_raw(N, X, incX, Y, incY); -#endif -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const void *A, - const unsigned int lda, const void *X, const int incX, - const float beta, void *Y, const int incY, - ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - return cblas_sgemv( - order, transA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast(Y), incY); -#else - return sgemv_raw( - TStorageOrder, TransA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast(Y), incY); -#endif - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - return sgemv_FP16( - TStorageOrder, TransA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast<_FP16 *>(Y), incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const float *A, - const unsigned int lda, const float *X, const int incX, - const float beta, float *Y, const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor; - return cblas_sgemv(order, transA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -#else - return sgemv_raw(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -#endif -} - -unsigned int isamax(const unsigned int N, const float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_isamax(N, X, incX); -#else - return isamax_raw(N, X, incX); -#endif -} - -void sine(const unsigned int N, float *X, float *Y, float alpha) { -#ifdef USE_NEON - nntrainer::neon::sine(N, X, Y, alpha); -#else - unsigned int i = 0; - while (i < N) { - Y[i] = std::sin(alpha * X[i]); - ++i; - } -#endif -} - -void cosine(const unsigned int N, float *X, float *Y, float alpha) { -#ifdef USE_NEON - nntrainer::neon::cosine(N, X, Y, alpha); -#else - unsigned int i = 0; - while (i < N) { - Y[i] = std::cos(alpha * X[i]); - ++i; - } -#endif -} - -void inv_sqrt_inplace(const unsigned int N, float *X) { -#ifdef USE_NEON - nntrainer::neon::inv_sqrt_inplace(N, X); -#else - for (unsigned int i = 0; i < N; ++i) { - X[i] = 1 / std::sqrt(static_cast(X[i])); - } -#endif -} -static void ele_mul_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X * alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_add_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X + alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_sub_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X - alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_div_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X / (alpha * *Y) + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - 
float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); -#else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); -#else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); -#else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); -#else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -} // namespace nntrainer diff --git a/nntrainer/tensor/char_tensor.cpp b/nntrainer/tensor/char_tensor.cpp index ec70943b84..33e3b94acf 100644 --- a/nntrainer/tensor/char_tensor.cpp +++ b/nntrainer/tensor/char_tensor.cpp @@ -8,11 +8,10 @@ * @bug No known bugs except for NYI items */ +#include +#include #include #include - -#include -#include #include namespace nntrainer { diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp new file mode 100644 index 0000000000..db82c6549d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file arm_compute_backend.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for arm + * + */ +#include +#include +#include +#include +#include +#include + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_or_int4(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int4_to_fp32(N, X, Y); + } else { + __fallback_scopy_int4_to_float32(N, X, incX, Y, incY); + } +} + +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_to_fp32(N, X, Y); + } else { + __fallback_scopy_int8_to_float32(N, X, incX, Y, incY); + } +} + +void 
sine(const unsigned int N, float *X, float *Y, float alpha) { + nntrainer::neon::sine(N, X, Y, alpha); +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + nntrainer::neon::cosine(N, X, Y, alpha); +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + nntrainer::neon::inv_sqrt_inplace(N, X); +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { + __cblas_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY) { + __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { + return __cblas_sdot(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __cblas_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __cblas_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __cblas_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __cblas_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h new file mode 100644 index 0000000000..a7617dc942 --- /dev/null +++ 
b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file arm_compute_backend.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for ARM + * + */ +#ifndef __ARM_COMPUTE_BACKEND_H__ +#define __ARM_COMPUTE_BACKEND_H__ +#ifdef __cplusplus + +#include +#include + +namespace nntrainer { + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * + * + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's rows + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, 
bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief squared root transformation inplace : X = 
1 / sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float snrm2(const unsigned int N, const float *X, const unsigned int incX); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, 
const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); +/** + * @brief isamax function : index of first maximum + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief inverse square root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector 
division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +} /* namespace nntrainer */ +#endif /* __cplusplus */ +#endif /* __ARM_COMPUTE_BACKEND_H__ */ diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp new file mode 100644 index 0000000000..133aeae47c --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file arm_compute_backend_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Half-precision compute backend for ARM + * + */ +#include +#include +#include +#include +#include + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + assert(incX > 0); + + if (incX == 1) { + nntrainer::neon::hscal(N, X, alpha); + } else { + __fallback_sscal(N, alpha, X, incX); + } +} + +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) { + assert(incX > 0); + _FP16 sum; + if (incX == 1) { + sum = nntrainer::neon::hnrm2(N, X); + } else { + sum = __fallback_snrm2(N, X, incX); + } + return sum; +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::hcopy(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_fp32_to_fp16(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_fp16_to_fp32(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int4_to_fp16(N, X, Y); + } else { + __fallback_scopy_int4_to_float16(N, X, incX, Y, incY); + } +} + +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_to_fp16(N, X, Y); + } else { + __fallback_scopy_int8_to_float16(N, X, incX, Y, incY); + } +} + +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY) { + _FP16 ret = 0; + assert(incX > 0 && incY > 0); + if (incX == 1 && incY == 1) { + ret = nntrainer::neon::hdot(N, X, Y); + } else { + ret = __fallback_sdot(N, X, incX, Y, incY); + } + return ret; +} + 
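Note: the fp16 wrappers in this file all follow a single dispatch rule: unit-stride data (incX == incY == 1) is routed to the NEON kernels in nntrainer::neon, while any other stride falls back to the scalar __fallback_* reference implementations. A minimal caller-side sketch of that contract (assuming an ENABLE_FP16 build where _FP16 maps to __fp16; the function name and values are illustrative, not part of this patch):

#include <arm_compute_backend.h>
#include <vector>

void backend_dispatch_sketch() {
  std::vector<__fp16> x(64, __fp16(2.f));
  std::vector<__fp16> y(64, __fp16(1.f));
  // contiguous buffers: both calls take the NEON fast path
  nntrainer::sscal(64, 0.5f, x.data(), 1);                  // X = 0.5 * X
  __fp16 d = nntrainer::sdot(64, x.data(), 1, y.data(), 1); // sum(X * Y)
  // stride of 2: the same entry points silently use the __fallback_* path
  nntrainer::saxpy(32, 1.f, x.data(), 2, y.data(), 2);      // Y = X + Y
  (void)d;
}
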
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::haxpy(N, alpha, X, Y); + } else { + __fallback_saxpy(N, alpha, X, incX, Y, incY); + } +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + if (TStorageOrder) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); + } else { + nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA, + TransB); + } +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY) { + if (TStorageOrder) { + __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, + Y, incY); + } else { + if (TransA) { + nntrainer::neon::hgemv_transpose(A, X, Y, M, N, alpha, beta); + } else { + nntrainer::neon::hgemv(A, X, Y, M, N, alpha, beta); + } + } +} + +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + unsigned int max_idx = 0; + if (incX == 1 && N >= 8) { + max_idx = nntrainer::neon::isamax(N, X); + } else { + max_idx = __fallback_isamax(N, X, incX); + } + return max_idx; +} + +void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + nntrainer::neon::inv_sqrt_inplace(N, X); +} + +void transpose_matrix(const unsigned int M, const unsigned int N, + const __fp16 *src, unsigned int ld_src, __fp16 *dst, + unsigned int ld_dst) { + nntrainer::neon::transpose_matrix(M, N, src, ld_src, dst, ld_dst); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/arm/armv7_neon.h b/nntrainer/tensor/cpu_backend/arm/armv7_neon.h new file mode 100644 index 0000000000..8b47e0525d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/armv7_neon.h @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: 
Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file armv7_neon.h + * @date 16 August 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Conditional header file to support unsupported intrinsics on armv7l + * + */ + +#include +#include + +/** + * @brief macro for vfmaq_n_f32 + * + */ +#define vfmaq_n_f32(a, b, n) vaddq_f32(a, vmulq_f32(b, vmovq_n_f32(n))) + +/** + * @brief vdivq_f32 fallback implementation + * + * @param a a for a / b + * @param b b for a / b + * @return float32x4_t + */ +static inline float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { + float32x4_t ret; + for (unsigned int i = 0; i < 4; ++i) { + ret[i] = a[i] / b[i]; + } + return ret; +} + +/** + * @brief vsqrtq_f32 fallback implementation + * + * @param a input vector + * @return float32x4_t + */ +static inline float32x4_t vsqrtq_f32(float32x4_t a) { + float32x4_t ret; + for (unsigned int i = 0; i < 4; ++i) { + ret[i] = std::sqrt(a[i]); + } + return ret; +} + +/** + * @brief vmaxvq_f32 fallback implementation + * + * @param a input vector + * @return float + */ +static inline float vmaxvq_f32(float32x4_t a) { + float ret = a[0]; + for (unsigned int i = 1; i < 4; ++i) { + if (ret < a[i]) + ret = a[i]; + } + return ret; +} + +/** + * @brief vaddvq_f32 + * + * @param a input vector + * @return float32_t + */ +static inline float32_t vaddvq_f32(float32x4_t a) { + float32_t ret = a[0]; + for (unsigned int i = 1; i < 4; ++i) { + ret += a[i]; + } + return ret; +} + +#ifdef ENABLE_FP16 +/** + * @brief macro for vfmaq_n_f16 + * + */ +#define vfmaq_n_f16(a, b, c, n) vaddq_f16(a, vmulq_f16(b, vmovq_n_f16(c[n]))) + +/** + * @brief vmaxvq_f16 fallback implementation + * + * @param a input vector + * @return float16_t + */ +static inline float16_t vmaxvq_f16(float16x8_t a) { + float16_t ret = a[0]; + for (unsigned int i = 1; i < 8; ++i) { + if (ret < a[i]) + ret = a[i]; + } + return ret; +} +#endif diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.cpp diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.h diff --git a/nntrainer/tensor/hgemm/hgemm_common.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_common.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_common.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_common.h diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel.h diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp similarity index 98% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp index 713117fbb0..2ca0925132 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp @@ -15,6 +15,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif template <> void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, diff 
--git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp index 3935187065..e1a724b41c 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp @@ -17,6 +17,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif // 1. Partial sum 64 digits : worst accuracy, best latency #define KERNEL_1x8_ACC8() \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp index f446d32b5a..57e04675f8 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_4x4() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp index 118e99d9db..1c1d26f270 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_4X8() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp index 3ec2b0306d..cca0d4d3a9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8X16() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp index ebd75fecb4..6b577469be 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp @@ -17,6 +17,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8X16() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp index b072c3255e..5fdf52e83a 100644 --- 
a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8x8() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/meson.build b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_kernel/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/meson.build diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_noTrans.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp index b1497329dd..e6a562e045 100644 --- a/nntrainer/tensor/hgemm/hgemm_noTrans.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp @@ -20,6 +20,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_noTrans.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.h diff --git a/nntrainer/tensor/hgemm/hgemm_pack.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_pack.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_pack.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_pack.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/meson.build 
b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/meson.build diff --git a/nntrainer/tensor/hgemm/hgemm_transA.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transA.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transA.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transA.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.h diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transAB.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transAB.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.h diff --git a/nntrainer/tensor/hgemm/hgemm_transB.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transB.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transB.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transB.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.h diff --git a/nntrainer/tensor/hgemm/hgemm_util.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_util.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_util.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_util.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.h diff --git a/nntrainer/tensor/hgemm/meson.build b/nntrainer/tensor/cpu_backend/arm/hgemm/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/meson.build diff --git a/nntrainer/tensor/matrix_transpose_neon/mask_neon.h b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/mask_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/mask_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/mask_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_kernels_neon.h b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_kernels_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_kernels_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_kernels_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.cpp b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.cpp similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.cpp rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.cpp diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.h 
b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/meson.build b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/meson.build similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/meson.build rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/meson.build diff --git a/nntrainer/tensor/cpu_backend/arm/meson.build b/nntrainer/tensor/cpu_backend/arm/meson.build new file mode 100644 index 0000000000..0be7034abe --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/meson.build @@ -0,0 +1,37 @@ +arm_compute_backend_headers = [ + 'arm_compute_backend.h', + 'neon_impl.h', + 'neon_setting.h', + 'neon_mathfun.h', + 'neon_mathfun.hxx' +] +arm_compute_backend_sources = [ + 'arm_compute_backend.cpp', + 'neon_impl.cpp' +] + +if get_option('enable-fp16') + arm_compute_backend_sources += 'arm_compute_backend_fp16.cpp' + arm_compute_backend_sources += 'neon_impl_fp16.cpp' + + subdir('hgemm') + nntrainer_inc += include_directories('hgemm') + nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' + + subdir('matrix_transpose_neon') + nntrainer_inc += include_directories('matrix_transpose_neon') + nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon' +endif + +arch = host_machine.cpu_family() +if arch == 'arm' + arm_compute_backend_headers += 'armv7_neon.h' +endif + +foreach s : arm_compute_backend_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : arm_compute_backend_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp b/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp new file mode 100644 index 0000000000..86edaddde1 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file neon_impl.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Single-precision computation functions based on NEON + * + */ + +#include +#include +#include +#include +#ifdef ARMV7 +#include +#endif + +namespace nntrainer::neon { + +void sgemv(const float *A, const float *X, float *Y, uint32_t rows, + uint32_t cols, float alpha, float beta) { + const float *__restrict x; + + for (unsigned int i = 0; i < rows; ++i) { + Y[i] = Y[i] * beta; + } + + float32x4_t v_alpha = vmovq_n_f32(alpha); + + if (cols % 16 == 0) { + for (unsigned i = 0; i < cols; i += 16) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t x4_7 = vld1q_f32(&X[i + 4]); + float32x4_t x8_11 = vld1q_f32(&X[i + 8]); + float32x4_t x12_15 = vld1q_f32(&X[i + 12]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + x4_7 = vmulq_f32(x4_7, v_alpha); + x8_11 = vmulq_f32(x8_11, v_alpha); + x12_15 = vmulq_f32(x12_15, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + wvec8_11 = vld1q_f32(&w[8]); + wvec12_15 = vld1q_f32(&w[12]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + y0 = 
vmlaq_f32(y0, wvec4_7, x4_7); + y0 = vmlaq_f32(y0, wvec8_11, x8_11); + y0 = vmlaq_f32(y0, wvec12_15, x12_15); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + + } else if (cols % 8 == 0) { + for (unsigned i = 0; i < cols; i += 8) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t x4_7 = vld1q_f32(&X[i + 4]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + x4_7 = vmulq_f32(x4_7, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + y0 = vmlaq_f32(y0, wvec4_7, x4_7); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + } else if (cols % 4 == 0) { + for (unsigned i = 0; i < cols; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + } +} + +void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t rows, + uint32_t cols, float alpha, float beta) { + const float *__restrict x; + + const float32x4_t v_beta = vdupq_n_f32(beta); + const float32x4_t v_alpha = vdupq_n_f32(alpha); + + if (cols % 16 == 0) { + unsigned int n = cols / 16; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + unsigned int step; + for (unsigned int i = 0; i < cols / 16; ++i) { + initialized[i] = false; + } + + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 16) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + step = j / 16; + if (!initialized[step]) { + y0_3 = vmulq_f32(y0_3, v_beta); + y4_7 = vmulq_f32(y4_7, v_beta); + y8_11 = vmulq_f32(y8_11, v_beta); + y12_15 = vmulq_f32(y12_15, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + wvec8_11 = vld1q_f32(&w[8]); + wvec12_15 = vld1q_f32(&w[12]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + y4_7 = vmlaq_f32(y4_7, wvec4_7, x); + y8_11 = vmlaq_f32(y8_11, wvec8_11, x); + y12_15 = vmlaq_f32(y12_15, wvec12_15, x); + + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } + } + free(initialized); + return; + } else if (cols % 8 == 0) { + unsigned int n = cols / 8; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + unsigned int step; + for (unsigned int i = 0; i < cols / 8; ++i) { + initialized[i] = false; + } + + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 8) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + + step = j / 8; + if (!initialized[step]) 
{ + y0_3 = vmulq_f32(y0_3, v_beta); + y4_7 = vmulq_f32(y4_7, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3, wvec4_7; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + y4_7 = vmlaq_f32(y4_7, wvec4_7, x); + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + } + } + free(initialized); + return; + } else if (cols % 4 == 0) { + unsigned int n = cols / 4; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + + unsigned int step; + for (unsigned int i = 0; i < cols / 4; ++i) { + initialized[i] = false; + } + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 4) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + step = j / 4; + if (!initialized[step]) { + y0_3 = vmulq_f32(y0_3, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + vst1q_f32(&y[0], y0_3); + } + } + free(initialized); + } + + return; +} + +void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { + + unsigned int idx = 0; + + // keep in mind that : len(X) = N, and len(Y) = 2*N + + // processing batch of 16 + float32x4_t y0, y1, y2, y3; + float32x4_t y4, y5, y6, y7; + + uint8_t low0, low1, high0, high1; + + for (; (N - idx) >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + + uint8x8_t low = vget_low_u8(batch); + uint8x8_t high = vget_high_u8(batch); + unsigned int i = 0; + for (; i < 8; ++i) { + low0 = low[i] >> 4; + low1 = low[i] & 0x0f; + + high0 = high[i] >> 4; + high1 = high[i] & 0x0f; + + // 0 ~ 8 + if (i < 2) { + y0[2 * i] = low0; + y0[2 * i + 1] = low1; + } else if (i < 4) { + y1[2 * (i - 2)] = low0; + y1[2 * (i - 2) + 1] = low1; + } else if (i < 6) { + y2[2 * (i - 4)] = low0; + y2[2 * (i - 4) + 1] = low1; + } else { + y3[2 * (i - 6)] = low0; + y3[2 * (i - 6) + 1] = low1; + } + + // 8 ~ 16 + if (i < 2) { + y4[2 * i] = high0; + y4[2 * i + 1] = high1; + } else if (i < 4) { + y5[2 * (i - 2)] = high0; + y5[2 * (i - 2) + 1] = high1; + } else if (i < 6) { + y6[2 * (i - 4)] = high0; + y6[2 * (i - 4) + 1] = high1; + } else { + y7[2 * (i - 6)] = high0; + y7[2 * (i - 6) + 1] = high1; + } + } + vst1q_f32(&Y[2 * idx], y0); + vst1q_f32(&Y[2 * idx + 4], y1); + vst1q_f32(&Y[2 * idx + 8], y2); + vst1q_f32(&Y[2 * idx + 12], y3); + vst1q_f32(&Y[2 * idx + 16], y4); + vst1q_f32(&Y[2 * idx + 20], y5); + vst1q_f32(&Y[2 * idx + 24], y6); + vst1q_f32(&Y[2 * idx + 28], y7); + } + + // processing remaining batch of 8 + for (; (N - idx) >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + + unsigned int i = 0; + for (; i < 8; ++i) { + low0 = batch[i] >> 4; + low1 = batch[i] & 0x0f; + + if (i < 2) { + y0[2 * i] = low0; + y0[2 * i + 1] = low1; + } else if (i < 4) { + y1[2 * (i - 2)] = low0; + y1[2 * (i - 2) + 1] = low1; + } else if (i < 6) { + y2[2 * (i - 4)] = low0; + y2[2 * (i - 4) + 1] = low1; + } else { + y3[2 * (i - 6)] = low0; + y3[2 * (i - 6) + 1] = low1; + } + } + + vst1q_f32(&Y[2 * idx], y0); + vst1q_f32(&Y[2 * idx + 4], y1); + vst1q_f32(&Y[2 * idx + 8], y2); + vst1q_f32(&Y[2 * idx + 12], y3); + } + + // pocessing remaining values + for (; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) { + ///@note 
int8 Tensor and int4 Tensor share the same memory offset + unsigned int idx = 0; + for (; N - idx >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + vst1q_u8(&Y[idx], batch); + } + for (; N - idx >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + vst1_u8(&Y[idx], batch); + } + for (; N - idx >= 1; ++idx) { + Y[idx] = X[idx]; + } +} + +void sine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + if (alpha != 1.0) + x0_3 = vmulq_n_f32(x0_3, alpha); + float32x4_t sinx0_3 = sin_ps(x0_3); + vst1q_f32(&Y[i], sinx0_3); + } + while (i < N) { + Y[i] = std::sin(alpha * X[i]); + ++i; + } +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + if (alpha != 1.0) + x0_3 = vmulq_n_f32(x0_3, alpha); + float32x4_t cosx0_3 = cos_ps(x0_3); + vst1q_f32(&Y[i], cosx0_3); + } + while (i < N) { + Y[i] = std::cos(alpha * X[i]); + ++i; + } +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_7 = vld1q_f32(&X[i]); + float32x4_t x0_7_sqrt = vsqrtq_f32(x0_7); + float32x4_t ones = vmovq_n_f32(1); + float32x4_t x0_7_sqrt_div = vdivq_f32(ones, x0_7_sqrt); + vst1q_f32(&X[i], x0_7_sqrt_div); + } + while (i < N) { + X[i] = (1 / std::sqrt(static_cast(X[i]))); + ++i; + } +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else + Z[i] = alpha * X[i] * Y[i]; + ++i; + } +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] + alpha * Y[i]; + ++i; + } +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], 
vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] - alpha * Y[i]; + ++i; + } +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; + else + Z[i] = X[i] / (alpha * Y[i]); + ++i; + } +} + +void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { + unsigned int idx = 0; + for (; (N - idx) >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + uint8x8_t low = vget_low_u8(batch); + uint8x8_t high = vget_high_u8(batch); + + // convert to u16 + uint16x8_t batch_low_u16 = vmovl_u8(low); + uint16x8_t batch_high_u16 = vmovl_u8(high); + + // convert to u32 + uint32x4_t batch_low_u32_low = vmovl_u16(vget_low_u16(batch_low_u16)); + uint32x4_t batch_low_u32_high = vmovl_u16(vget_high_u16(batch_low_u16)); + uint32x4_t batch_high_u32_low = vmovl_u16(vget_low_u16(batch_high_u16)); + uint32x4_t batch_high_u32_high = vmovl_u16(vget_high_u16(batch_high_u16)); + + // todo : experiment with vcvt_f32_u32_ bitwise operation w.r.t. + // time/accuracy + vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_low_u32_low)); + vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_low_u32_high)); + vst1q_f32(&Y[idx + 8], vcvtq_f32_u32(batch_high_u32_low)); + vst1q_f32(&Y[idx + 12], vcvtq_f32_u32(batch_high_u32_high)); + } + for (; (N - idx) >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + + // convert to u16 + uint16x8_t batch_u16 = vmovl_u8(batch); + + // convert to u32 + uint32x4_t batch_u32_low = vmovl_u16(vget_low_u16(batch_u16)); + uint32x4_t batch_u32_high = vmovl_u16(vget_high_u16(batch_u16)); + + vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_u32_low)); + vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_u32_high)); + } + for (; (N - idx) >= 1; ++idx) { + Y[idx] = X[idx]; + } +} + +} // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/cpu_backend/arm/neon_impl.h similarity index 93% rename from nntrainer/tensor/blas_neon.h rename to nntrainer/tensor/cpu_backend/arm/neon_impl.h index 81f8c060ed..8261c3d2e1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl.h @@ -1,154 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2022 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_neon.h - * @date 4 Aug 2022 + * @file neon_impl.h + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for NYI items - * @brief This is header for blas neon implementation + * @brief Single-precision computation functions based on NEON * */ -#ifndef __BLAS_NEON_H_ -#define __BLAS_NEON_H_ +#ifndef __NEON_IMPL_H_ +#define __NEON_IMPL_H_ #ifdef __cplusplus #include #include #include +#include namespace nntrainer::neon { - -/** - * @brief sgemv computation with neon : 
Y = alpha*A*X + beta*Y - * @param[in] A float * for Matrix A - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] M number of A's row - * @param[in] N number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv(const float *A, const float *X, float *Y, uint32_t M, uint32_t N, - const float alpha, const float beta); - -/** - * @brief transposed sgemv computation with neon - * Y = alpha*transpose(A)*X - * + beta*Y - * @param[in] A float * for Matrix A - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] M number of A's row - * @param[in] N number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t M, - uint32_t N, float alpha, float beta); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X uint8_t * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); -/** - * @brief sine with neon: Y = sin(alpha * X) - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] alpha float * for scaling angle (radian) - */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); - -/** - * @brief cosine with neon: Y = cos(alpha * X) - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] alpha float * for scaling angle (radian) - */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); - -/** - * @brief inversed squared root transformation with neon : X = 1 / sqrt(X) - * - * @param N number of elements in X - * @param X float * for Vector X - */ -void inv_sqrt_inplace(const unsigned int N, float *X); - -/** - * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - -/** - * @brief elementwise vector addition : Z = X + alpha * Y + beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); -/** - * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + - * beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * 
for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - -/** - * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta - * * Z - * @note ZeroDivisionError is not guaranteed in this function - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_div(const unsigned N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - #ifdef ENABLE_FP16 + /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y * @param[in] A __fp16 * for Matrix A @@ -330,7 +204,6 @@ unsigned int isamax(const unsigned int N, const __fp16 *X); void custom_hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N, uint32_t K, float alpha, float beta, bool TransA, bool TransB); - /** * @brief squared root transformation with neon : X = sqrt(X) * @@ -338,9 +211,150 @@ void custom_hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const __fp16 *src, unsigned int ld_src, __fp16 *dst, + unsigned int ld_dst); #endif +/** + * @brief sgemv computation with neon : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] M number of A's row + * @param[in] N number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const float *A, const float *X, float *Y, uint32_t M, uint32_t N, + const float alpha, const float beta); + +/** + * @brief transposed sgemv computation with neon + * Y = alpha*transpose(A)*X + * + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] M number of A's row + * @param[in] N number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t M, + uint32_t N, float alpha, float beta); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); 
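One detail worth keeping in mind for the copy_int4_to_fp32 declaration above: X packs two 4-bit values per byte, high nibble first, so the output Y holds 2 * N floats. A scalar sketch of the layout the NEON kernel implements (reference only; the name copy_int4_to_fp32_ref is hypothetical and not part of this patch):

#include <cstdint>

// Unpack N bytes of packed int4 data into 2 * N floats, high nibble first,
// mirroring the scalar tail loop of the NEON implementation in neon_impl.cpp.
inline void copy_int4_to_fp32_ref(unsigned int N, const uint8_t *X, float *Y) {
  for (unsigned int i = 0; i < N; ++i) {
    Y[2 * i] = static_cast<float>(X[i] >> 4);        // high nibble
    Y[2 * i + 1] = static_cast<float>(X[i] & 0x0f);  // low nibble
  }
}
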
+/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief inverse square root transformation with neon : X = 1 / sqrt(X) + * + * @param N number of elements in X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); + +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + } // namespace nntrainer::neon #endif /* __cplusplus */ -#endif /* __BLAS_NEON_H__ */ +#endif /* __NEON_IMPL_H_ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp similarity index 69% rename from nntrainer/tensor/blas_neon.cpp rename to nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp index 4b6c05c72e..60792b3316 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp @@ -1,553 +1,27 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2022 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_neon.cpp - * @date 4 Aug 2022 + * @file neon_impl_fp16.cpp + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for 
NYI items - * @brief This is Source for blas neon implementation + * @brief Half-precision computation functions based on NEON * */ -#include -#include #include +#include #include -#include +#include +#include +#ifdef ARMV7 +#include +#endif namespace nntrainer::neon { -void sgemv(const float *A, const float *X, float *Y, uint32_t rows, - uint32_t cols, float alpha, float beta) { - const float *__restrict x; - - for (unsigned int i = 0; i < rows; ++i) { - Y[i] = Y[i] * beta; - } - - float32x4_t v_alpha = vmovq_n_f32(alpha); - - if (cols % 16 == 0) { - for (unsigned i = 0; i < cols; i += 16) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t x4_7 = vld1q_f32(&X[i + 4]); - float32x4_t x8_11 = vld1q_f32(&X[i + 8]); - float32x4_t x12_15 = vld1q_f32(&X[i + 12]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - x4_7 = vmulq_f32(x4_7, v_alpha); - x8_11 = vmulq_f32(x8_11, v_alpha); - x12_15 = vmulq_f32(x12_15, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - wvec8_11 = vld1q_f32(&w[8]); - wvec12_15 = vld1q_f32(&w[12]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - y0 = vmlaq_f32(y0, wvec4_7, x4_7); - y0 = vmlaq_f32(y0, wvec8_11, x8_11); - y0 = vmlaq_f32(y0, wvec12_15, x12_15); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - - } else if (cols % 8 == 0) { - for (unsigned i = 0; i < cols; i += 8) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t x4_7 = vld1q_f32(&X[i + 4]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - x4_7 = vmulq_f32(x4_7, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - y0 = vmlaq_f32(y0, wvec4_7, x4_7); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - } else if (cols % 4 == 0) { - for (unsigned i = 0; i < cols; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - } -} - -void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t rows, - uint32_t cols, float alpha, float beta) { - const float *__restrict x; - - const float32x4_t v_beta = vdupq_n_f32(beta); - const float32x4_t v_alpha = vdupq_n_f32(alpha); - - if (cols % 16 == 0) { - unsigned int n = cols / 16; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - unsigned int step; - for (unsigned int i = 0; i < cols / 16; ++i) { - initialized[i] = false; - } - - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 16) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - float32x4_t y4_7 = vld1q_f32(&y[4]); - float32x4_t y8_11 = vld1q_f32(&y[8]); - 
float32x4_t y12_15 = vld1q_f32(&y[12]); - step = j / 16; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - y4_7 = vmulq_f32(y4_7, v_beta); - y8_11 = vmulq_f32(y8_11, v_beta); - y12_15 = vmulq_f32(y12_15, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - wvec8_11 = vld1q_f32(&w[8]); - wvec12_15 = vld1q_f32(&w[12]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - y4_7 = vmlaq_f32(y4_7, wvec4_7, x); - y8_11 = vmlaq_f32(y8_11, wvec8_11, x); - y12_15 = vmlaq_f32(y12_15, wvec12_15, x); - - vst1q_f32(&y[0], y0_3); - vst1q_f32(&y[4], y4_7); - vst1q_f32(&y[8], y8_11); - vst1q_f32(&y[12], y12_15); - } - } - free(initialized); - return; - } else if (cols % 8 == 0) { - unsigned int n = cols / 8; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - unsigned int step; - for (unsigned int i = 0; i < cols / 8; ++i) { - initialized[i] = false; - } - - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 8) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - float32x4_t y4_7 = vld1q_f32(&y[4]); - - step = j / 8; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - y4_7 = vmulq_f32(y4_7, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3, wvec4_7; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - y4_7 = vmlaq_f32(y4_7, wvec4_7, x); - vst1q_f32(&y[0], y0_3); - vst1q_f32(&y[4], y4_7); - } - } - free(initialized); - return; - } else if (cols % 4 == 0) { - unsigned int n = cols / 4; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - - unsigned int step; - for (unsigned int i = 0; i < cols / 4; ++i) { - initialized[i] = false; - } - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 4) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - step = j / 4; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - vst1q_f32(&y[0], y0_3); - } - } - free(initialized); - } - - return; -} - -void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { - - unsigned int idx = 0; - - // keep in mind that : len(X) = N, and len(Y) = 2*N - - // processing batch of 16 - float32x4_t y0, y1, y2, y3; - float32x4_t y4, y5, y6, y7; - - uint8_t low0, low1, high0, high1; - - for (; (N - idx) >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - - uint8x8_t low = vget_low_u8(batch); - uint8x8_t high = vget_high_u8(batch); - unsigned int i = 0; - for (; i < 8; ++i) { - low0 = low[i] >> 4; - low1 = low[i] & 0x0f; - - high0 = high[i] >> 4; - high1 = high[i] & 0x0f; - - // 0 ~ 8 - if (i < 2) { - y0[2 * i] = low0; - y0[2 * i + 1] = low1; - } else if (i < 4) { - y1[2 * (i - 2)] = low0; - y1[2 * (i - 2) + 1] = low1; - } else if (i < 6) { - y2[2 * (i - 4)] = low0; - y2[2 * (i - 4) + 1] = low1; - } else { - y3[2 * (i - 6)] = low0; - y3[2 * (i - 6) + 1] = low1; - } - - // 8 ~ 16 - if (i < 2) { - y4[2 * i] = high0; - y4[2 * i + 1] = high1; - } else if (i < 4) { - y5[2 * (i - 2)] 
= high0; - y5[2 * (i - 2) + 1] = high1; - } else if (i < 6) { - y6[2 * (i - 4)] = high0; - y6[2 * (i - 4) + 1] = high1; - } else { - y7[2 * (i - 6)] = high0; - y7[2 * (i - 6) + 1] = high1; - } - } - vst1q_f32(&Y[2 * idx], y0); - vst1q_f32(&Y[2 * idx + 4], y1); - vst1q_f32(&Y[2 * idx + 8], y2); - vst1q_f32(&Y[2 * idx + 12], y3); - vst1q_f32(&Y[2 * idx + 16], y4); - vst1q_f32(&Y[2 * idx + 20], y5); - vst1q_f32(&Y[2 * idx + 24], y6); - vst1q_f32(&Y[2 * idx + 28], y7); - } - - // processing remaining batch of 8 - for (; (N - idx) >= 8; idx += 8) { - uint8x8_t batch = vld1_u8(&X[idx]); - - unsigned int i = 0; - for (; i < 8; ++i) { - low0 = batch[i] >> 4; - low1 = batch[i] & 0x0f; - - if (i < 2) { - y0[2 * i] = low0; - y0[2 * i + 1] = low1; - } else if (i < 4) { - y1[2 * (i - 2)] = low0; - y1[2 * (i - 2) + 1] = low1; - } else if (i < 6) { - y2[2 * (i - 4)] = low0; - y2[2 * (i - 4) + 1] = low1; - } else { - y3[2 * (i - 6)] = low0; - y3[2 * (i - 6) + 1] = low1; - } - } - - vst1q_f32(&Y[2 * idx], y0); - vst1q_f32(&Y[2 * idx + 4], y1); - vst1q_f32(&Y[2 * idx + 8], y2); - vst1q_f32(&Y[2 * idx + 12], y3); - } - - // pocessing remaining values - for (; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -} - -void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) { - ///@note int8 Tensor and int4 Tensor share the same memory offset - unsigned int idx = 0; - for (; N - idx >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - vst1q_u8(&Y[idx], batch); - } - for (; N - idx >= 8; idx += 8) { - uint8x8_t batch = vld1_u8(&X[idx]); - vst1_u8(&Y[idx], batch); - } - for (; N - idx >= 1; ++idx) { - Y[idx] = X[idx]; - } -} - -void sine(const unsigned int N, float *X, float *Y, float alpha) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - if (alpha != 1.0) - x0_3 = vmulq_n_f32(x0_3, alpha); - float32x4_t sinx0_3 = sin_ps(x0_3); - vst1q_f32(&Y[i], sinx0_3); - } - while (i < N) { - Y[i] = std::sin(alpha * X[i]); - ++i; - } -} - -void cosine(const unsigned int N, float *X, float *Y, float alpha) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - if (alpha != 1.0) - x0_3 = vmulq_n_f32(x0_3, alpha); - float32x4_t cosx0_3 = cos_ps(x0_3); - vst1q_f32(&Y[i], cosx0_3); - } - while (i < N) { - Y[i] = std::cos(alpha * X[i]); - ++i; - } -} - -void inv_sqrt_inplace(const unsigned int N, float *X) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_7 = vld1q_f32(&X[i]); - float32x4_t x0_7_sqrt = vsqrtq_f32(x0_7); - float32x4_t ones = vmovq_n_f32(1); - float32x4_t x0_7_sqrt_div = vdivq_f32(ones, x0_7_sqrt); - vst1q_f32(&X[i], x0_7_sqrt_div); - } - while (i < N) { - X[i] = (1 / std::sqrt(static_cast(X[i]))); - ++i; - } -} - -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; - else - Z[i] = alpha * X[i] * 
Y[i]; - ++i; - } -} - -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; - else - Z[i] = X[i] + alpha * Y[i]; - ++i; - } -} - -void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; - else - Z[i] = X[i] - alpha * Y[i]; - ++i; - } -} - -void ele_div(const unsigned N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; - else - Z[i] = X[i] / (alpha * Y[i]); - ++i; - } -} - -#ifdef ENABLE_FP16 - void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, float alpha, float beta) { const unsigned int batch = 0; @@ -1405,48 +879,6 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) { } } -void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { - unsigned int idx = 0; - for (; (N - idx) >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - uint8x8_t low = vget_low_u8(batch); - uint8x8_t high = vget_high_u8(batch); - - // convert to u16 - uint16x8_t batch_low_u16 = vmovl_u8(low); - uint16x8_t batch_high_u16 = vmovl_u8(high); - - // convert to u32 - uint32x4_t batch_low_u32_low = vmovl_u16(vget_low_u16(batch_low_u16)); - uint32x4_t batch_low_u32_high = vmovl_u16(vget_high_u16(batch_low_u16)); - uint32x4_t batch_high_u32_low = vmovl_u16(vget_low_u16(batch_high_u16)); - uint32x4_t batch_high_u32_high = vmovl_u16(vget_high_u16(batch_high_u16)); - - // todo : experiment with vcvt_f32_u32_ bitwise operation w.r.t. 
-  // time/accuracy
-  vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_low_u32_low));
-  vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_low_u32_high));
-  vst1q_f32(&Y[idx + 8], vcvtq_f32_u32(batch_high_u32_low));
-  vst1q_f32(&Y[idx + 12], vcvtq_f32_u32(batch_high_u32_high));
-  }
-  for (; (N - idx) >= 8; idx += 8) {
-    uint8x8_t batch = vld1_u8(&X[idx]);
-
-    // convert to u16
-    uint16x8_t batch_u16 = vmovl_u8(batch);
-
-    // convert to u32
-    uint32x4_t batch_u32_low = vmovl_u16(vget_low_u16(batch_u16));
-    uint32x4_t batch_u32_high = vmovl_u16(vget_high_u16(batch_u16));
-
-    vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_u32_low));
-    vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_u32_high));
-  }
-  for (; (N - idx) >= 1; ++idx) {
-    Y[idx] = X[idx];
-  }
-}
-
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
   unsigned int idx = 0;

@@ -1719,5 +1151,9 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) {
   }
 }

-#endif
+void transpose_matrix(const unsigned int M, const unsigned int N,
+                      const __fp16 *src, unsigned int ld_src, __fp16 *dst,
+                      unsigned int ld_dst) {
+  transpose_neon<__fp16>(M, N, src, ld_src, dst, ld_dst);
+}
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h
new file mode 100644
index 0000000000..3023560b1e
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h
@@ -0,0 +1,85 @@
+/**
+ * @file neon_mathfun.h
+ * @date 15 Jan 2024
+ * @brief This is a collection of sin, cos, exp, log functions with NEON SIMD
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Julien Pommier
+ * @bug No known bugs except for NYI items
+ *
+ */

+/** NEON implementation of sin, cos, exp and log

+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/

+/** Copyright (C) 2011 Julien Pommier

+   This software is provided 'as-is', without any express or implied
+   warranty. In no event will the authors be held liable for any damages
+   arising from the use of this software.

+   Permission is granted to anyone to use this software for any purpose,
+   including commercial applications, and to alter it and redistribute it
+   freely, subject to the following restrictions:

+   1. The origin of this software must not be misrepresented; you must not
+      claim that you wrote the original software. If you use this software
+      in a product, an acknowledgment in the product documentation would be
+      appreciated but is not required.
+   2. Altered source versions must be plainly marked as such, and must not be
+      misrepresented as being the original software.
+   3. This notice may not be removed or altered from any source distribution.

+   (this is the zlib license)
+*/

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifndef NEON_MATHFUN_H_
+#define NEON_MATHFUN_H_

+#include
+/**
+ * @brief typedef for vector register.
+ *
+ */
+typedef float32x4_t v4sf; // vector of 4 float
+
+// prototypes
+/**
+ * @brief log function with neon x = log(x)
+ * @param[in] x register variable (float32x4_t)
+ */
+inline v4sf log_ps(v4sf x);
+
+/**
+ * @brief exp function with neon x = exp(x)
+ * @param[in] x register variable (float32x4_t)
+ */
+inline v4sf exp_ps(v4sf x);
+
+/**
+ * @brief sin_ps function with neon x = sin(x)
+ * @param[in] x register variable (float32x4_t)
+ */
+inline v4sf sin_ps(v4sf x);
+
+/**
+ * @brief cos_ps function with neon x = cos(x)
+ * @param[in] x register variable (float32x4_t)
+ */
+inline v4sf cos_ps(v4sf x);
+
+/**
+ * @brief sincos_ps function with neon: computes sin(x) and cos(x) at once
+ * @param[in] x register variable (float32x4_t)
+ * @param[out] s register to receive sin(x) (float32x4_t)
+ * @param[out] c register to receive cos(x) (float32x4_t)
+ */
+inline void sincos_ps(v4sf x, v4sf *s, v4sf *c);
+
+#include "neon_mathfun.hxx"
+
+#endif
+#endif
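These four-lane primitives are the building blocks of the float transcendental kernels; a sketch of the typical consumption pattern (illustrative only: the header comes from this patch and is assumed to be on the include path, while the loop shape mirrors how sine()/cosine() use sin_ps/cos_ps with a scalar tail):

#include <arm_neon.h>
#include <cmath>
#include <neon_mathfun.h>

// y[i] = exp(x[i]) for arbitrary n: 4 lanes per iteration, scalar remainder
void exp_buffer(unsigned int n, const float *x, float *y) {
  unsigned int i = 0;
  for (; n - i >= 4; i += 4)
    vst1q_f32(&y[i], exp_ps(vld1q_f32(&x[i])));
  for (; i < n; ++i)
    y[i] = std::exp(x[i]);
}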
diff --git a/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx
new file mode 100644
index 0000000000..93e4af8b66
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx
@@ -0,0 +1,341 @@
+/**
+ * @file neon_mathfun.hxx
+ * @date 15 Jan 2024
+ * @brief This is a collection of sin, cos, exp, log functions with NEON SIMD
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Julien Pommier
+ * @bug No known bugs except for NYI items
+ *
+ */

+/* NEON implementation of sin, cos, exp and log

+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/

+/* Copyright (C) 2011 Julien Pommier

+   This software is provided 'as-is', without any express or implied
+   warranty. In no event will the authors be held liable for any damages
+   arising from the use of this software.

+   Permission is granted to anyone to use this software for any purpose,
+   including commercial applications, and to alter it and redistribute it
+   freely, subject to the following restrictions:

+   1. The origin of this software must not be misrepresented; you must not
+      claim that you wrote the original software. If you use this software
+      in a product, an acknowledgment in the product documentation would be
+      appreciated but is not required.
+   2. Altered source versions must be plainly marked as such, and must not be
+      misrepresented as being the original software.
+   3. This notice may not be removed or altered from any source distribution.

+   (this is the zlib license)
+*/

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)

+typedef uint32x4_t v4su; // vector of 4 uint32
+typedef int32x4_t v4si;  // vector of 4 int32

+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 -1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 -1.2420140846E-1
+#define c_cephes_log_p4 +1.4249322787E-1
+#define c_cephes_log_p5 -1.6668057665E-1
+#define c_cephes_log_p6 +2.0000714765E-1
+#define c_cephes_log_p7 -2.4999993993E-1
+#define c_cephes_log_p8 +3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375

+/* natural logarithm computed for 4 simultaneous float
+   return NaN for x <= 0
+*/
+/**
+ * @brief log function with simd
+ *
+ * @param x input register variable
+ * @return v4sf
+ */
+v4sf log_ps(v4sf x) {
+  v4sf one = vdupq_n_f32(1);

+  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));

+  v4si ux = vreinterpretq_s32_f32(x);

+  v4si emm0 = vshrq_n_s32(ux, 23);

+  /* keep only the fractional part */
+  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+  x = vreinterpretq_f32_s32(ux);

+  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+  v4sf e = vcvtq_f32_s32(emm0);

+  e = vaddq_f32(e, one);

+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+  x = vsubq_f32(x, one);
+  e = vsubq_f32(
+    e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+  x = vaddq_f32(x, tmp);

+  v4sf z = vmulq_f32(x, x);

+  v4sf y = vdupq_n_f32(c_cephes_log_p0);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+  y = vmulq_f32(y, x);

+  y = vmulq_f32(y, z);

+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+  y = vaddq_f32(y, tmp);

+  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+  y = vsubq_f32(y, tmp);

+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+  x = vaddq_f32(x, y);
+  x = vaddq_f32(x, tmp);
+  x = vreinterpretq_f32_u32(vorrq_u32(
+    vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+  return x;
+}
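The degree-8 polynomial above only needs to be accurate near 1 because of the mantissa/exponent split; a scalar sketch of the same cephes-style reduction (illustrative only, with std::log standing in for the polynomial):

#include <cmath>

// log(x) = e * ln(2) + log(m), where x = m * 2^e and m is kept close to 1;
// ln(2) is split into c_cephes_log_q2 + c_cephes_log_q1 for extra precision
float log_reduced(float x) {
  int e;
  float m = std::frexp(x, &e); // m in [0.5, 1)
  if (m < 0.707106781f) {      // SQRTHF branch: keep m in [sqrt(.5), sqrt(2))
    m += m;
    --e;
  }
  return e * 0.693359375f + e * -2.12194440e-4f + std::log(m);
}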
+
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 float at once */
+/**
+ * @brief exponential function with simd
+ *
+ * @param x input register variable
+ * @return v4sf
+ */
+v4sf exp_ps(v4sf x) {
+  v4sf tmp, fx;
+
+  v4sf one = vdupq_n_f32(1);
+  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, subtract 1 */
+  v4su mask = vcgtq_f32(tmp, fx);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1,
+                                        c_cephes_exp_p2, c_cephes_exp_p3,
+                                        c_cephes_exp_p4, c_cephes_exp_p5};
+  v4sf y = vld1q_dup_f32(cephes_exp_p + 0);
+  v4sf c1 = vld1q_dup_f32(cephes_exp_p + 1);
+  v4sf c2 = vld1q_dup_f32(cephes_exp_p + 2);
+  v4sf c3 = vld1q_dup_f32(cephes_exp_p + 3);
+  v4sf c4 = vld1q_dup_f32(cephes_exp_p + 4);
+  v4sf c5 = vld1q_dup_f32(cephes_exp_p + 5);
+
+  y = vmulq_f32(y, x);
+  z = vmulq_f32(x, x);
+  y = vaddq_f32(y, c1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c5);
+
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, one);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+  mm = vshlq_n_s32(mm, 23);
+  v4sf pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1 8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0 2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2 4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Note also that when you compute sin(x), cos(x) is available at
+   almost no extra price so both sin_ps and cos_ps make use of
+   sincos_ps..
+ */
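A hedged usage sketch of the pair evaluation described above (illustrative only; it compiles only for NEON targets and assumes neon_mathfun.h is on the include path):

#include <arm_neon.h>
#include <neon_mathfun.h>

void sincos_demo() {
  float angles[4] = {0.0f, 0.5f, 1.0f, 1.5f};
  v4sf s, c;
  // one call produces both results; sin_ps/cos_ps simply discard one half
  sincos_ps(vld1q_f32(angles), &s, &c);
  float sines[4];
  vst1q_f32(sines, s); // sines[i] == sin(angles[i]) to float precision
}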
+/**
+ * @brief sincos function with simd: computes sin(x) and cos(x) simultaneously
+ *
+ * @param x input register variable
+ * @param[out] ysin register receiving sin(x)
+ * @param[out] ycos register receiving cos(x)
+ */
+void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
+  v4sf xmm1, xmm2, xmm3, y;
+
+  v4su emm2;
+
+  v4su sign_mask_sin, sign_mask_cos;
+  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+  x = vabsq_f32(x);
+
+  /* scale by 4/Pi */
+  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+  /* store the integer part of y in mm0 */
+  emm2 = vcvtq_u32_f32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+  y = vcvtq_f32_u32(emm2);
+
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4 < x < Pi/2 */
diff --git a/nntrainer/tensor/blas_neon_setting.h b/nntrainer/tensor/cpu_backend/arm/neon_setting.h
rename from nntrainer/tensor/blas_neon_setting.h
rename to nntrainer/tensor/cpu_backend/arm/neon_setting.h
--- a/nntrainer/tensor/blas_neon_setting.h
+++ b/nntrainer/tensor/cpu_backend/arm/neon_setting.h
 *
- * @file blas_neon_setting.h
+ * @file neon_setting.h
 * @date 18 Jan 2024
 * @see https://github.com/nnstreamer/nntrainer
 *      https://arxiv.org/abs/1706.03762
diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp
new file mode 100644
index 0000000000..00c0462a50
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong
+ *
+ * @file cblas_interface.cpp
+ * @date 23 April 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong
+ * @bug No known bugs except for NYI items
+ * @brief Thin interface to the CBLAS single-precision routines
+ *
+ */
+
+#include
+#include
+
+namespace nntrainer {
+void __cblas_saxpy(const unsigned int N, const float alpha, const float *X,
+                   const unsigned int incX, float *Y, const unsigned int incY) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
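Each wrapper optionally pins the OpenBLAS thread pool before delegating; a minimal, illustrative call through the interface (vector contents and sizes are made up for the example, and cblas_interface.h is assumed to be on the include path):

#include <cblas_interface.h>
#include <vector>

void axpy_demo() {
  std::vector<float> x(128, 1.0f), y(128, 2.0f);
  // y = 2 * x + y over contiguous storage (both strides are 1)
  nntrainer::__cblas_saxpy(128, 2.0f, x.data(), 1, y.data(), 1);
}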
+void __cblas_sgemv(const unsigned int TStorageOrder, bool TransA,
+                   const unsigned int M, const unsigned int N,
+                   const float alpha, const float *A, const unsigned int lda,
+                   const float *X, const unsigned int incX, const float beta,
+                   float *Y, const unsigned int incY) {
+  CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans;
+  CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor;
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  cblas_sgemv(order, transA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+
+float __cblas_sdot(const unsigned int N, const float *X,
+                   const unsigned int incX, const float *Y,
+                   const unsigned int incY) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  return cblas_sdot(N, X, incX, Y, incY);
+}
+
+void __cblas_scopy(const unsigned int N, const float *X,
+                   const unsigned int incX, float *Y, const unsigned int incY) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  cblas_scopy(N, X, incX, Y, incY);
+}
+
+void __cblas_sscal(const unsigned int N, const float alpha, float *X,
+                   const unsigned int incX) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  cblas_sscal(N, alpha, X, incX);
+}
+
+float __cblas_snrm2(const unsigned int N, const float *X,
+                    const unsigned int incX) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  return cblas_snrm2(N, X, incX);
+}
+
+void __cblas_sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
+                   const unsigned int M, const unsigned int N,
+                   const unsigned int K, const float alpha, const float *A,
+                   const unsigned int lda, const float *B,
+                   const unsigned int ldb, const float beta, float *C,
+                   const unsigned int ldc) {
+  CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans;
+  CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor;
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  cblas_sgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C,
+              ldc);
+}
+
+unsigned int __cblas_isamax(const unsigned int N, const float *X,
+                            const unsigned int incX) {
+#ifdef BLAS_NUM_THREADS
+  openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
+  return cblas_isamax(N, X, incX);
+}
+} // namespace nntrainer
diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h
new file mode 100644
index 0000000000..fdae5b9215
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong
+ *
+ * @file cblas_interface.h
+ * @date 23 April 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong
+ * @bug No known bugs except for NYI items
+ * @brief Thin interface to the CBLAS single-precision routines
+ *
+ */
+
+#ifndef __CBLAS_INTERFACE_H__
+#define __CBLAS_INTERFACE_H__
+#ifdef __cplusplus
+
+namespace nntrainer {
+/**
+ * @brief saxpy computation : Y = alpha*X + Y
+ * @param[in] N number of elements in Y
+ * @param[in] alpha float number
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void __cblas_saxpy(const unsigned int N, const float alpha, const float *X,
+                   const unsigned int incX, float *Y, const unsigned int incY);
+/**
+ * @brief sgemv computation : Y = alpha*A*X + beta*Y
+ * @param[in] TStorageOrder Row major / Col major
+ * @param[in] A float * for Matrix A
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] rows number of A's row
+ * @param[in] cols number of A's columns
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+void __cblas_sgemv(const unsigned int TStorageOrder, bool TransA,
+                   const
unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY); +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float __cblas_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __cblas_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void __cblas_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float __cblas_snrm2(const unsigned int N, const float *X, + const unsigned int incX); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] TStorageOrder Row major / Col major + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __cblas_sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int __cblas_isamax(const unsigned int N, const float *X, + const unsigned int incX); +} // namespace nntrainer + +#endif +#endif diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/meson.build b/nntrainer/tensor/cpu_backend/cblas_interface/meson.build new file mode 100644 index 0000000000..104bfca3e2 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cblas_interface/meson.build @@ -0,0 +1,14 @@ +cblas_interface_headers = [ + 'cblas_interface.h', +] +cblas_interface_sources = [ + 'cblas_interface.cpp', +] + +foreach s : cblas_interface_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : cblas_interface_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/cpu_backend.h b/nntrainer/tensor/cpu_backend/cpu_backend.h new file mode 100644 index 0000000000..eb50f12335 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cpu_backend.h @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file cpu_backend.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Computational backend for CPU considering architecture dependency + * + */ + +#ifndef __CPU_BACKEND_H__ 
+#define __CPU_BACKEND_H__ +#ifdef __cplusplus + +#ifdef ARM +#include +#elif X86 +#include +#else +#include +#endif + +#include +#include + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +extern void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +extern _FP16 snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern _FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv 
computation : Y = alpha*A*X + beta*Y
+ * @param[in] A _FP16 * for Matrix A
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] rows number of A's row
+ * @param[in] cols number of A's columns
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+extern void sgemv(const unsigned int TStorageOrder, bool TransA,
+                  const unsigned int M, const unsigned int N, const float alpha,
+                  const _FP16 *A, const unsigned int lda, const _FP16 *X,
+                  const unsigned int incX, const float beta, _FP16 *Y,
+                  const unsigned int incY);
+/**
+ * @brief elementwise vector multiplication : Z = X ⊙ (alpha * Y) +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y,
+                    _FP16 *Z, float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector addition : Z = X + alpha * Y + beta *
+ * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y,
+                    _FP16 *Z, float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+/**
+ * @brief elementwise vector subtraction with neon : Z = X - alpha * Y +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+                    float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta
+ * * Z
+ * @note ZeroDivisionError is not guaranteed in this function
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+                    float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+/**
+ * @brief isamax function : index of first maxima
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ */
+extern unsigned int isamax(const unsigned int N, const _FP16 *X,
+                           const unsigned int incX);
+
+/**
+ * @brief inverse square root transformation inplace : X = 1 / sqrt(X)
+ *
+ * @param N size of X
+ * @param X __fp16 * for Vector X
+ */
+extern void inv_sqrt_inplace(const unsigned int N, _FP16 *X);
+
+/**
+ * @brief Matrix transpose / 2D Tensor transpose
+ *
+ * @param M row length of input
matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +extern void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +extern void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +extern float snrm2(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +extern void scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for 
Matrix A
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] rows number of A's row
+ * @param[in] cols number of A's columns
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+extern void sgemv(const unsigned int TStorageOrder, bool TransA,
+                  const unsigned int M, const unsigned int N, const float alpha,
+                  const float *A, const unsigned int lda, const float *X,
+                  const unsigned int incX, const float beta, float *Y,
+                  const unsigned int incY);
+/**
+ * @brief isamax function : index of first maxima
+ * @param[in] N number of elements in X
+ * @param[in] X float * for Vector X
+ */
+extern unsigned int isamax(const unsigned int N, const float *X,
+                           const unsigned int incX);
+
+/**
+ * @brief sine with neon: Y = sin(alpha * X)
+ * @param[in] N number of elements in X
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] alpha float for scaling angle (radian)
+ */
+extern void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f);
+
+/**
+ * @brief cosine with neon: Y = cos(alpha * X)
+ * @param[in] N number of elements in X
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] alpha float for scaling angle (radian)
+ */
+extern void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f);
+
+/**
+ * @brief inverse square root transformation inplace : X = 1 / sqrt(X)
+ *
+ * @param N size of X
+ * @param X float * for Vector X
+ */
+extern void inv_sqrt_inplace(const unsigned int N, float *X);
+/**
+ * @brief elementwise vector multiplication : Z = X ⊙ (alpha * Y) +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] Z float * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_mul(const unsigned int N, const float *X, const float *Y,
+                    float *Z, float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector addition : Z = X + alpha * Y + beta *
+ * Z
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] Z float * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_add(const unsigned int N, const float *X, const float *Y,
+                    float *Z, float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+/**
+ * @brief elementwise vector subtraction with neon : Z = X - alpha * Y +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] Z float * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
+                    float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta
+ * * Z
+ * @note ZeroDivisionError is not guaranteed in this function
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @param[in] Y float * for Vector Y
+ * @param[in] Z float * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+extern void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
+                    float alpha = 1.f, float beta = 0.f,
+                    unsigned int i_stride = 1, unsigned int o_stride = 1);
+
+#endif
+#endif
diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback.cpp
new file mode 100644
index 0000000000..1a06dbcd87
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/fallback/fallback.cpp
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong
+ *
+ * @file fallback.cpp
+ * @date 23 April 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong
+ * @bug No known bugs except for NYI items
+ * @brief Fallback interface (Raw implementations)
+ *
+ */
+
+#include
+#include
+#include
+
+namespace nntrainer {
+
+void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, float *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int4_to_float32(N, X, incX, Y, incY);
+}
+
+void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, float *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int8_to_float32(N, X, incX, Y, incY);
+}
+
+void sine(const unsigned int N, float *X, float *Y, float alpha) {
+  __fallback_sine(N, X, Y, alpha);
+}
+
+void cosine(const unsigned int N, float *X, float *Y, float alpha) {
+  __fallback_cosine(N, X, Y, alpha);
+}
+
+void inv_sqrt_inplace(const unsigned int N, float *X) {
+  __fallback_inv_sqrt_inplace(N, X);
+}
+
+void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void saxpy(const unsigned int N, const float alpha, const float *X,
+           const unsigned int incX, float *Y, const unsigned int incY) {
+  __fallback_saxpy(N, alpha, X, incX, Y, incY);
+}
+
+void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
+           const unsigned int N, const float alpha, const float *A,
+           const unsigned int lda, const float *X, const unsigned int incX,
+           const float beta, float *Y, const unsigned int incY) {
+  __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
+                   incY);
+}
+
+float sdot(const unsigned int N, const float *X, const unsigned int incX,
+           const float *Y, const unsigned int incY) {
+  return __fallback_sdot(N, X, incX, Y, incY);
+}
+
+void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX,
+           uint8_t *Y, const unsigned
int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __fallback_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __fallback_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __fallback_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/cpu_backend/fallback/fallback.h similarity index 74% rename from nntrainer/tensor/blas_interface.h rename to nntrainer/tensor/cpu_backend/fallback/fallback.h index b57ea3e057..268cda78d2 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/cpu_backend/fallback/fallback.h @@ -1,28 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2020 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_interface.h - * @date 28 Aug 2020 + * @file fallback.h + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for NYI items - * @brief This is dummy header for blas support + * @brief Fallback interface (Raw implementations) * */ -#ifndef __BLAS_INTERFACE_H_ -#define __BLAS_INTERFACE_H_ +#ifndef __FALLBACK_H__ +#define __FALLBACK_H__ #ifdef __cplusplus -#ifdef USE_CUBLAS -#include -#include -#endif - #include #include + namespace nntrainer { #ifdef ENABLE_FP16 @@ -32,14 +27,15 @@ namespace nntrainer { * @param[in] X __fp16 * for Vector X * @param[in] alpha float number */ -void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX); +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); /** * @brief snrm2 computation : Euclidean norm * @param[in] N number of elements in X * @param[in] X __fp16 * for Vector X */ -_FP16 snrm2(const int N, const _FP16 *X, const int incX); +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX); /** * @brief copy function : Y = X @@ -47,8 +43,8 @@ _FP16 snrm2(const int N, const _FP16 *X, const int incX); * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y */ -void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, - const int incY); +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -56,8 +52,8 @@ void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, * @param[in] X float * for Vector X * @param[in] Y __fp16 * for Vector Y */ -void scopy(const unsigned int N, const float *X, const int incX, _FP16 *Y, - const int incY); +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -65,8 +61,8 @@ void scopy(const unsigned int N, const 
float *X, const int incX, _FP16 *Y, * @param[in] X __fp16 * for Vector X * @param[in] Y float * for Vector Y */ -void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, - const int incY); +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -75,7 +71,8 @@ void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, * @param[in] Y __fp16 * for Vector Y */ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, + const unsigned int incY); /** * @brief copy function : Y = X @@ -84,7 +81,8 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, * @param[in] Y __fp16 * for Vector Y */ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, + const unsigned int incY); /** * @brief sdot computation : sum of all X * Y @@ -103,7 +101,7 @@ _FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, * @param[in] Y __fp16 * for Vector Y */ void saxpy(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, const unsigned int incY); /** * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, @@ -134,8 +132,8 @@ void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, */ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, const unsigned int N, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY); + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY); /** * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + * beta * Z @@ -205,7 +203,8 @@ void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, * @param[in] N number of elements in X * @param[in] X __fp16 * for Vector X */ -unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX); +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); /** * @brief squared root transformation inplace : X = sqrt(X) @@ -229,51 +228,36 @@ void transpose_matrix(const unsigned int M, const unsigned int N, const _FP16 *src, unsigned int ld_src, _FP16 *dst, unsigned int ld_dst); #endif -/** - * @brief sscal computation : X = alpha * X - * @param[in] N number of elements in X - * @param[in] X void * for Vector X - * @param[in] alpha float number - */ -void sscal(const unsigned int N, const float alpha, void *X, const int incX, - ml::train::TensorDim::DataType d_type); /** * @brief sscal computation : X = alpha * X * @param[in] N number of elements in X * @param[in] X float * for Vector X * @param[in] alpha float number */ -void sscal(const unsigned int N, const float alpha, float *X, const int incX); +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); /** * @brief snrm2 computation : Euclidean norm * @param[in] N number of elements in X * @param[in] X float * for Vector X */ -float snrm2(const int N, const float *X, const int incX); -/** - * @brief copy function : Y = X - * @param[in] N number of elements in X - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - */ -void scopy(const unsigned int N, const void *X, const int incX, 
void *Y, - const int incY, ml::train::TensorDim::DataType d_type); +float snrm2(const unsigned int N, const float *X, const unsigned int incX); /** * @brief copy function : Y = X * @param[in] N number of elements in X * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y */ -void scopy(const unsigned int N, const float *X, const int incX, float *Y, - const int intY); +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); /** * @brief copy function : Y = X * @param[in] N number of elements in X * @param[in] X uint8_t * for Vector X * @param[in] Y uint8_t * for Vector Y */ -void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, - const int intY); +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); /** * @brief copy function : Y = X * @param[in] N number of elements in X @@ -281,7 +265,8 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, * @param[in] Y float * for Vector Y */ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int intY); + const unsigned int incX, float *Y, + const unsigned int incY); /** * @brief copy function : Y = X @@ -290,7 +275,8 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, * @param[in] Y float * for Vector Y */ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int intY); + const unsigned int incX, float *Y, + const unsigned int incY); /** * @brief sdot computation : sum of all X * Y @@ -300,16 +286,6 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, */ float sdot(const unsigned int N, const float *X, const unsigned int incX, const float *Y, const unsigned int incY); -/** - * @brief saxpy computation : Y = alpha*X + Y - * @param[in] N number of elements in Y - * @param[in] alpha float number - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - */ -void saxpy(const unsigned int N, const float alpha, const void *X, - const int incX, void *Y, const int incY, - ml::train::TensorDim::DataType d_type); /** * @brief saxpy computation : Y = alpha*X + Y * @param[in] N number of elements in Y @@ -318,24 +294,7 @@ void saxpy(const unsigned int N, const float alpha, const void *X, * @param[in] Y float * for Vector Y */ void saxpy(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY); -/** - * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T - * @param[in] A void * for Matrix A - * @param[in] B void * for Matrix B - * @param[in] C void * for Matrix C - * @param[in] M number of op(A)'s and C's row - * @param[in] N number of op(B)'s and C's columns - * @param[in] K number of op(A)'s and columns and op(B)'s rows - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const void *A, const unsigned int lda, - const void *B, const unsigned int ldb, const float beta, void *C, - const unsigned int ldc, ml::train::TensorDim::DataType d_type); + const unsigned int incX, float *Y, const unsigned int incY); /** * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, * where op(X) is one of X or X**T @@ -353,21 +312,6 @@ void sgemm(const unsigned int TStorageOrder, 
bool TransA, bool TransB, const float alpha, const float *A, const unsigned int lda, const float *B, const unsigned int ldb, const float beta, float *C, const unsigned int ldc); -/** - * @brief sgemv computation : Y = alpha*A*X + beta*Y - * @param[in] A void * for Matrix A - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - * @param[in] rows number of A's row - * @param[in] cols number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const void *A, - const unsigned int lda, const void *X, const int incX, - const float beta, void *Y, const int incY, - ml::train::TensorDim::DataType d_type); /** * @brief sgemv computation : Y = alpha*A*X + beta*Y * @param[in] A float * for Matrix A @@ -380,14 +324,15 @@ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, */ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, const unsigned int N, const float alpha, const float *A, - const unsigned int lda, const float *X, const int incX, - const float beta, float *Y, const int incY); + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); /** * @brief isamax function : index of first maxima * @param[in] N number of elements in X * @param[in] X float * for Vector X */ -unsigned int isamax(const unsigned int N, const float *X, const int incX); +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); /** * @brief sine with neon: Y = sin(alpha * X) @@ -479,4 +424,4 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, unsigned int o_stride = 1); } /* namespace nntrainer */ #endif /* __cplusplus */ -#endif /* __BLAS_INTERFACE_H__ */ +#endif /* __FALLBACK_H__ */ diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp new file mode 100644 index 0000000000..214eb2936d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Fallback interface (Raw implementations) + * + */ + +#include <assert.h> +#include <fallback.h> +#include <fallback_internal.h> + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + __fallback_sscal(N, alpha, X, incX); +} + +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) { + assert(incX > 0); + return __fallback_snrm2(N, X, incX); +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + __fallback_scopy_int4_to_float16(N, X, incX, Y,
incY); +} + +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + __fallback_scopy_int8_to_float16(N, X, incX, Y, incY); +} + +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY) { + assert(incX > 0 && incY > 0); + return __fallback_sdot(N, X, incX, Y, incY); +} + +void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY) { + __fallback_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY) { + __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + return __fallback_isamax(N, X, incX); +} + +void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + __fallback_inv_sqrt_inplace(N, X); +} + +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst) { + __fallback_transpose_matrix(M, N, src, ld_src, dst, ld_dst); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp new file mode 100644 index 0000000000..2ea2b6c407 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Fallback internal computation functions (raw C++ implementations) + * + */ + +#include <assert.h> +#include <cmath> +#include <cstdint> +#include <cstdlib> +#include <fallback_internal.h> + +#define sgemv_loop(ci, cj, cM, cN) \ + do { \ + float y0; \ + unsigned int i, j; \
for (ci = 0; ci != cM; ci++) { \ + y0 = Y[ci * incY] * beta; \ + for (cj = 0; cj != cN; cj++) \ + y0 += A[i + j * lda] * X[cj * incX]; \ + Y[ci * incY] = y0; \ + } \ + } while (0); +namespace nntrainer { + +void __fallback_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + assert(incX > 0); + for (unsigned int i = 0; i < N; ++i) + X[i * incX] = alpha * X[i * incX]; +} + +float __fallback_snrm2(const unsigned int N, const float *X, + const unsigned int incX) { + assert(incX > 0); + float sum = 0.0f; + float tmp; + + for (unsigned int i = 0; i < N; i++) { + tmp = X[i * incX]; + sum += tmp * tmp; + } + return std::sqrt(sum); +} + +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; +} + +void __fallback_scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx * incY] = X[idx * incX]; + } +} + +void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void __fallback_scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx * incY] = X[idx * incX]; + } +} + +float __fallback_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY) { + float ret = 0; + for (unsigned int i = 0; i < N; ++i) { + ret += X[i * incX] * Y[i * incY]; + } + return ret; +} + +void __fallback_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = Y[i * incY] + X[i * incX] * alpha; +} + +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + for (unsigned int m = 0; m < M; ++m) { + for (unsigned int n = 0; n < N; ++n) { + double c = 0.0; + float c_old = C[m * ldc + n]; + for (unsigned int k = 0; k < K; ++k) { + float a, b; + a = ((TransA == true) ? A[k * lda + m] : A[m * lda + k]); + b = ((TransB == true) ?
B[n * ldb + k] : B[k * ldb + n]); + c += a * b; + } + C[m * ldc + n] = alpha * c; + if (beta != 0.0) + C[m * ldc + n] += beta * c_old; + } + } +} + +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY) { + + if (TransA == true) { + sgemv_loop(i, j, N, M); + } else { + sgemv_loop(j, i, M, N); + } +} + +unsigned int __fallback_isamax(const unsigned int N, const float *X, + const unsigned int incX) { + /* return the element index of the first maximum of |X| */ + unsigned int max_idx = 0; + float max_val = std::abs(X[0]); + for (unsigned int i = 1; i < N; ++i) { + float cur_val = std::abs(X[i * incX]); + if (cur_val > max_val) { + max_val = cur_val; + max_idx = i; + } + } + + return max_idx; +} + +void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + while (i < N) { + Y[i] = std::sin(alpha * X[i]); + ++i; + } +} + +void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + while (i < N) { + Y[i] = std::cos(alpha * X[i]); + ++i; + } +} + +void __fallback_inv_sqrt_inplace(const unsigned int N, float *X) { + for (unsigned int i = 0; i < N; ++i) { + X[i] = 1 / std::sqrt(static_cast<float>(X[i])); + } +} + +void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X * alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_add(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X + alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_sub(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X - alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_div(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X / (alpha * *Y) + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} +} // namespace nntrainer diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h new file mode 100644 index 0000000000..22703bbd1b --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Fallback internal computation function declarations (raw implementations) + * + */ + +#ifndef __FALLBACK_INTERNAL_H__ +#define __FALLBACK_INTERNAL_H__ +#ifdef __cplusplus + +#include <cstdint> +#include <tensor_dim.h> + +namespace nntrainer { + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +void __fallback_sscal(const unsigned int N,
const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +_FP16 __fallback_snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +_FP16 __fallback_sdot(const unsigned int N, const _FP16 *X, + const unsigned int incX, const _FP16 *Y, + const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A __fp16 * for Matrix A + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void
__fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *X, const unsigned int incX, const float beta, + _FP16 *Y, const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note Division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +unsigned int __fallback_isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void __fallback_inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void __fallback_transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, + _FP16 *dst,
unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void __fallback_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float __fallback_snrm2(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float __fallback_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const
unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int __fallback_isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine : Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha); + +/** + * @brief cosine : Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha); + +/** + * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void __fallback_inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_add(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_sub(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note Division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_div(const unsigned N, const float *X,
const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +} // namespace nntrainer +#endif +#endif diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp new file mode 100644 index 0000000000..ad794aa792 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Half-precision fallback computation functions (raw implementations) + * + */ + +#include <assert.h> +#include <cmath> +#include <cstdint> +#include <cstdlib> +#include <fallback_internal.h> + +#define hgemv_loop(ci, cj, cM, cN) \ + do { \ + float y0; \ + unsigned int i, j; \ + for (ci = 0; ci != cM; ci++) { \ + y0 = static_cast<float>(Y[ci * incY] * static_cast<_FP16>(beta)); \ + for (cj = 0; cj != cN; cj++) \ + y0 += static_cast<float>(A[i + j * lda] * X[cj * incX]); \ + Y[ci * incY] = static_cast<_FP16>(y0); \ + } \ + } while (0); + +#define hgemm_loop() \ + do { \ + for (unsigned int m = 0; m < M; ++m) { \ + for (unsigned int n = 0; n < N; ++n) { \ + float c = 0; \ + _FP16 c_old = C[m * ldc + n]; \ + for (unsigned int k = 0; k < K; ++k) { \ + _FP16 a, b; \ + a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); \ + b = ((TransB) ? B[n * ldb + k] : B[k * ldb + n]); \ + c += static_cast<float>(a * b); \ + } \ + C[m * ldc + n] = static_cast<_FP16>(alpha * c); \ + if (beta != 0.0) \ + C[m * ldc + n] += static_cast<_FP16>(beta) * c_old; \ + } \ + } \ + } while (0); + +#define haxpy_loop() \ + do { \ + unsigned int i; \ + for (i = 0; i < N; ++i) \ + Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX]; \ + } while (0); + +namespace nntrainer { + +void __fallback_sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + for (unsigned int i = 0; i < N; ++i) + X[i * incX] = static_cast<_FP16>(alpha) * X[i * incX]; +} + +_FP16 __fallback_snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + float sum = 0; + float tmp; + for (unsigned int i = 0; i < N; i++) { + tmp = static_cast<float>(X[i * incX]); + sum += tmp * tmp; + } + return static_cast<_FP16>(std::sqrt(sum)); +} + +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; +} + +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = static_cast<_FP16>(X[i * incX]); +} + +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = static_cast<float>(X[i * incX]); +} + +void __fallback_scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void __fallback_scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx] = X[idx]; + } +} + +_FP16 __fallback_sdot(const unsigned int N, const _FP16 *X, + const unsigned
int incX, const _FP16 *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + float ret = 0; + for (unsigned int i = 0; i < N; ++i) { + ret += static_cast<float>(X[i * incX]) * static_cast<float>(Y[i * incY]); + } + return static_cast<_FP16>(ret); +} + +void __fallback_saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + haxpy_loop(); +} + +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + hgemm_loop(); +} + +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *X, const unsigned int incX, const float beta, + _FP16 *Y, const unsigned int incY) { + + if (TransA == true) { + hgemv_loop(i, j, N, M); + } else { + hgemv_loop(j, i, M, N); + } +} + +void __fallback_ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X * static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X + static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X - static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X / (static_cast<_FP16>(alpha) * *Y) + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +unsigned int __fallback_isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + /* return the element index of the first maximum of |X| */ + unsigned int max_idx = 0; + _FP16 max_val = (X[0] >= 0) ? X[0] : static_cast<_FP16>(-X[0]); + for (unsigned int i = 1; i < N; ++i) { + _FP16 cur_val = (X[i * incX] >= 0) ? X[i * incX] : static_cast<_FP16>(-X[i * incX]); + if (cur_val > max_val) { + max_val = cur_val; + max_idx = i; + } + } + return max_idx; +} + +void __fallback_inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + for (unsigned int i = 0; i < N; ++i) { + X[i] = static_cast<_FP16>(1 / std::sqrt(static_cast<float>(X[i]))); + } +} + +void __fallback_transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, + _FP16 *dst, unsigned int ld_dst) { + for (unsigned int i = 0; i < M; i++) { + for (unsigned int j = 0; j < N; j++) { + dst[i + j * ld_dst] = src[i * ld_src + j]; + } + } +} + +} // namespace nntrainer diff --git a/nntrainer/tensor/cpu_backend/fallback/meson.build b/nntrainer/tensor/cpu_backend/fallback/meson.build new file mode 100644 index 0000000000..86d31d5ffe --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/meson.build @@ -0,0 +1,24 @@ +fallback_headers = ['fallback_internal.h'] +fallback_sources = ['fallback_internal.cpp'] + +arch = host_machine.cpu_family() + +if arch != 'arm' and arch != 'aarch64' and arch != 'x86_64' and arch != 'x86' + fallback_headers += 'fallback.h' + fallback_sources += 'fallback.cpp' + if get_option('enable-fp16') + fallback_sources += 'fallback_fp16.cpp' + endif +endif + +if get_option('enable-fp16') + fallback_sources += 'fallback_internal_fp16.cpp' +endif + +foreach s : fallback_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : fallback_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/meson.build b/nntrainer/tensor/cpu_backend/meson.build new file mode 100644 index 0000000000..e75f1f9ad6 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/meson.build @@ -0,0 +1,28 @@ +cpu_backend_headers = [ + 'cpu_backend.h', +] + +subdir('fallback') +nntrainer_inc += include_directories('fallback') +nntrainer_inc_abs += meson.current_source_dir() / 'fallback' + +if get_option('enable-blas') + subdir('cblas_interface') + nntrainer_inc += include_directories('cblas_interface') + nntrainer_inc_abs += meson.current_source_dir() / 'cblas_interface' +endif + +arch = host_machine.cpu_family() +if arch == 'arm' or arch == 'aarch64' or get_option('platform') == 'android' + subdir('arm') + nntrainer_inc += include_directories('arm') + nntrainer_inc_abs += meson.current_source_dir() / 'arm' +elif arch == 'x86_64' or arch == 'x86' + subdir('x86') + nntrainer_inc += include_directories('x86') + nntrainer_inc_abs += meson.current_source_dir() / 'x86' +endif + +foreach h : cpu_backend_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/cpu_backend/x86/blas_avx.cpp similarity index 98% rename from nntrainer/tensor/blas_avx.cpp rename to nntrainer/tensor/cpu_backend/x86/blas_avx.cpp index ce59583d6f..c923cb340f 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/cpu_backend/x86/blas_avx.cpp @@ -18,7 +18,7 @@ #include -namespace nntrainer::avx { +namespace nntrainer { void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); @@ -114,4 +114,4 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } -} // namespace nntrainer::avx +} // namespace nntrainer diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/cpu_backend/x86/blas_avx.h similarity index 95% rename from nntrainer/tensor/blas_avx.h rename to nntrainer/tensor/cpu_backend/x86/blas_avx.h index ab1270a208..c1a714ddca 100644 --- a/nntrainer/tensor/blas_avx.h +++
b/nntrainer/tensor/cpu_backend/x86/blas_avx.h @@ -18,7 +18,7 @@ #include #include -namespace nntrainer::avx { +namespace nntrainer { /** * @brief Converts half-precision floating point values to single-precision * @@ -40,7 +40,7 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); -} // namespace nntrainer::avx +} // namespace nntrainer #endif /* __cplusplus */ #endif /* __BLAS_AVX_H_ */ diff --git a/nntrainer/tensor/cpu_backend/x86/meson.build b/nntrainer/tensor/cpu_backend/x86/meson.build new file mode 100644 index 0000000000..301bb8d54c --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/meson.build @@ -0,0 +1,20 @@ +simd_interface_x86_headers = [ + 'x86_compute_backend.h', +] +simd_interface_x86_sources = [ + 'x86_compute_backend.cpp', +] + +if get_option('enable-fp16') + simd_interface_x86_headers += 'blas_avx.h' + simd_interface_x86_sources += 'blas_avx.cpp' + simd_interface_x86_sources += 'x86_compute_backend_fp16.cpp' +endif + +foreach s : simd_interface_x86_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : simd_interface_x86_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp new file mode 100644 index 0000000000..bf35984b32 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#include <x86_compute_backend.h> + +#include <assert.h> +#include <cblas_interface.h> +#include <cstdint> +#include <fallback_internal.h> + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int4_to_float32(N, X, incX, Y, incY); +} + +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int8_to_float32(N, X, incX, Y, incY); +} + +void sine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_sine(N, X, Y, alpha); +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_cosine(N, X, Y, alpha); +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + __fallback_inv_sqrt_inplace(N, X); +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +}
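+ +/* + * A minimal usage sketch for the element-wise kernels above (illustrative + * comment only; the helper function below is hypothetical and not part of + * this backend). ele_mul computes Z = X * (alpha * Y) + beta * Z + * elementwise, with i_stride advancing Y and o_stride advancing X and Z; + * alpha defaults to 1.f and beta to 0.f in the header declarations. + * + * static void ele_kernels_example() { + * float x[4] = {1.f, 2.f, 3.f, 4.f}; + * float y[4] = {10.f, 20.f, 30.f, 40.f}; + * float z[4] = {0.f, 0.f, 0.f, 0.f}; + * ele_mul(4, x, y, z); // z[i] = x[i] * y[i] -> {10, 40, 90, 160} + * ele_add(4, x, y, z, 2.f, 1.f); // z[i] = x[i] + 2 * y[i] + z[i] + * } + */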
+ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { + __cblas_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY) { + __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { + return __cblas_sdot(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __cblas_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __cblas_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __cblas_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __cblas_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h new file mode 100644 index 0000000000..840abc94df --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#ifndef __x86_COMPUTE_BACKEND_H__ +#define __x86_COMPUTE_BACKEND_H__ +#ifdef __cplusplus + +#include <cstdint> +#include <tensor_dim.h> + +namespace nntrainer { + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned
int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A __fp16 * for Matrix A + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha
* Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note Division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float snrm2(const unsigned int N, const float *X, const unsigned int incX); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t
* for Vector Y + */ +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine : Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine : Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + *
@brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note Division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +} /* namespace nntrainer */ +#endif /* __cplusplus */ +#endif /* __x86_COMPUTE_BACKEND_H__ */ diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp new file mode 100644 index 0000000000..b0d3d97cbb --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#include <x86_compute_backend.h> + +#include <assert.h> +#include <cblas_interface.h> +#include <cstdint> +#include <fallback_internal.h> + +#ifdef ENABLE_FP16 +#include <blas_avx.h> +#endif + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned
+void sscal(const unsigned int N, const float alpha, _FP16 *X,
+           const unsigned int incX) {
+  __fallback_sscal(N, alpha, X, incX);
+}
+
+_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) {
+  assert(incX > 0);
+  return __fallback_snrm2(N, X, incX);
+}
+
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY) {
+  // fallback handles both contiguous and strided copies
+  __fallback_scopy(N, X, incX, Y, incY);
+}
+
+void scopy(const unsigned int N, const float *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY) {
+  if (incX == 1 && incY == 1) {
+    nntrainer::vcvt_f32_f16(N, X, Y);
+  } else {
+    __fallback_scopy(N, X, incX, Y, incY);
+  }
+}
+
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           float *Y, const unsigned int incY) {
+  if (incX == 1 && incY == 1) {
+    nntrainer::vcvt_f16_f32(N, X, Y);
+  } else {
+    __fallback_scopy(N, X, incX, Y, incY);
+  }
+}
+
+void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int4_to_float16(N, X, incX, Y, incY);
+}
+
+void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int8_to_float16(N, X, incX, Y, incY);
+}
+
+_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           const _FP16 *Y, const unsigned int incY) {
+  assert(incX > 0 && incY > 0);
+  return __fallback_sdot(N, X, incX, Y, incY);
+}
+
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
+           const unsigned int incX, _FP16 *Y, const unsigned int incY) {
+  __fallback_saxpy(N, alpha, X, incX, Y, incY);
+}
+
+void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
+           const unsigned int M, const unsigned int N, const unsigned int K,
+           const float alpha, const _FP16 *A, const unsigned int lda,
+           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
+           const unsigned int ldc) {
+  // promote the operands to fp32, run the fp32 kernel, and convert back
+  float *A_ = new float[M * K];
+  float *B_ = new float[N * K];
+  float *C_ = new float[M * N];
+
+  scopy(M * K, A, 1, A_, 1);
+  scopy(N * K, B, 1, B_, 1);
+  scopy(M * N, C, 1, C_, 1);
+
+  __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A_, lda, B_,
+                ldb, beta, C_, ldc);
+  scopy(M * N, C_, 1, C, 1);
+
+  delete[] A_;
+  delete[] B_;
+  delete[] C_;
+}
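+
+/* sgemv also promotes to fp32: lenX and lenY below are the strided extents of
+ * X and Y, which swap between M and N depending on whether A is transposed. */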
+void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
+           const unsigned int N, const float alpha, const _FP16 *A,
+           const unsigned int lda, const _FP16 *X, const unsigned int incX,
+           const float beta, _FP16 *Y, const unsigned int incY) {
+  unsigned int lenX = (TransA) ? 1 + (M - 1) * (incX) : 1 + (N - 1) * (incX);
+  unsigned int lenY = (TransA) ? 1 + (N - 1) * (incY) : 1 + (M - 1) * (incY);
+
+  float *A_ = new float[M * N];
+  float *X_ = new float[lenX];
+  float *Y_ = new float[lenY];
+
+  scopy(M * N, A, 1, A_, 1);
+  scopy(lenX, X, 1, X_, 1);
+  scopy(lenY, Y, 1, Y_, 1);
+
+  __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A_, lda, X_, incX, beta,
+                Y_, incY);
+
+  scopy(lenY, Y_, 1, Y, 1);
+
+  delete[] A_;
+  delete[] X_;
+  delete[] Y_;
+}
+
+void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+unsigned int isamax(const unsigned int N, const _FP16 *X,
+                    const unsigned int incX) {
+  return __fallback_isamax(N, X, incX);
+}
+
+void inv_sqrt_inplace(const unsigned int N, _FP16 *X) {
+  __fallback_inv_sqrt_inplace(N, X);
+}
+
+void transpose_matrix(const unsigned int M, const unsigned int N,
+                      const _FP16 *src, unsigned int ld_src, _FP16 *dst,
+                      unsigned int ld_dst) {
+  __fallback_transpose_matrix(M, N, src, ld_src, dst, ld_dst);
+}
+
+} /* namespace nntrainer */
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 7ca18a7b40..31e7e6b202 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -12,7 +12,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp
index 6753d51d34..7a36bcf44a 100644
--- a/nntrainer/tensor/half_tensor.cpp
+++ b/nntrainer/tensor/half_tensor.cpp
@@ -12,7 +12,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 397654300e..e83debcd1c 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -1,5 +1,4 @@
 tensor_sources = [
-  'blas_interface.cpp',
   'cache_elem.cpp',
   'cache_loader.cpp',
   'cache_pool.cpp',
@@ -33,7 +32,6 @@ tensor_headers = [
   'weight.h',
   'var_grad.h',
   'tensor_wrap_specs.h',
-  'blas_interface.h',
   'manager.h',
   'basic_planner.h',
   'memory_planner.h',
@@ -47,27 +45,9 @@ tensor_headers = [
   'task.h'
 ]
 
-arch = host_machine.cpu_family()
-if get_option('enable-fp16')
-  if arch == 'arm'
-    error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
-  elif arch == 'aarch64' or get_option('platform') == 'android'
-    if get_option('enable-neon')
-      tensor_sources += 'blas_neon.cpp'
-      tensor_headers += 'blas_neon.h'
-      subdir('hgemm')
-      nntrainer_inc += include_directories('hgemm')
-      nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
-
-      subdir('matrix_transpose_neon')
-      nntrainer_inc += include_directories('matrix_transpose_neon')
-      nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
-    endif
-  elif get_option('enable-avx')
-    tensor_sources += 'blas_avx.cpp'
-    tensor_headers += 'blas_avx.h'
-  endif
-endif
+subdir('cpu_backend')
+nntrainer_inc += include_directories('cpu_backend')
+nntrainer_inc_abs += meson.current_source_dir() / 'cpu_backend'
 
 if get_option('enable-fp16')
   tensor_headers += 'half_tensor.h'
diff --git a/nntrainer/tensor/short_tensor.cpp b/nntrainer/tensor/short_tensor.cpp
index 0e9be0d723..11418ef784 100644
--- a/nntrainer/tensor/short_tensor.cpp
+++ b/nntrainer/tensor/short_tensor.cpp
@@ -11,7 +11,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 74e0a3437e..d61e6b84ff 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -23,8 +23,8 @@
 #include
-#include <blas_interface.h>
 #include
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/utils/util_simd_neon.cpp b/nntrainer/utils/util_simd_neon.cpp
index d823897047..6de2419555 100644
--- a/nntrainer/utils/util_simd_neon.cpp
+++ b/nntrainer/utils/util_simd_neon.cpp
@@ -8,8 +8,11 @@
  * @bug No known bugs except for NYI items
  */
 
-#include <blas_neon.h>
+#include <neon_impl.h>
 #include <util_simd_neon.h>
+#ifdef ARMV7
+#include <armv7_neon.h>
+#endif
 
 namespace nntrainer::neon {
diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec
index 122d3a8d0c..17f6de1d04 100644
--- a/packaging/nntrainer.spec
+++ b/packaging/nntrainer.spec
@@ -57,7 +57,7 @@
 %define fp16_support -Denable-fp16=true
 %else
 %define fp16_support -Denable-fp16=false
-%endif # enalbe_fp16
+%endif # enable_fp16
 
 %ifarch aarch64
 %define neon_support -Denable-neon=true
@@ -540,7 +540,37 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
 %{_includedir}/nntrainer/half_tensor.h
 %endif
 %{_includedir}/nntrainer/tensor_wrap_specs.h
-%{_includedir}/nntrainer/blas_interface.h
+%{_includedir}/nntrainer/cpu_backend.h
+%{_includedir}/nntrainer/fallback_internal.h
+%{_includedir}/nntrainer/cblas_interface.h
+%ifarch %{ix86} x86_64
+%{_includedir}/nntrainer/x86_compute_backend.h
+%if 0%{?enable_fp16}
+%{_includedir}/nntrainer/blas_avx.h
+%endif
+%endif
+%ifarch aarch64
+%{_includedir}/nntrainer/arm_compute_backend.h
+%{_includedir}/nntrainer/neon_impl.h
+%{_includedir}/nntrainer/neon_setting.h
+%{_includedir}/nntrainer/neon_mathfun.h
+%{_includedir}/nntrainer/neon_mathfun.hxx
+%if 0%{?enable_fp16}
+%{_includedir}/nntrainer/hgemm.h
+%{_includedir}/nntrainer/matrix_transpose_neon.h
+%endif
+%endif
+%ifarch %arm
+  %{_includedir}/nntrainer/arm_compute_backend.h
+  %{_includedir}/nntrainer/armv7_neon.h
+  %{_includedir}/nntrainer/neon_impl.h
+  %{_includedir}/nntrainer/neon_setting.h
+  %{_includedir}/nntrainer/neon_mathfun.h
+  %{_includedir}/nntrainer/neon_mathfun.hxx
+%endif
+%ifnarch %{ix86} %arm x86_64 aarch64
+%{_includedir}/nntrainer/fallback.h
+%endif
 %{_includedir}/nntrainer/var_grad.h
 %{_includedir}/nntrainer/weight.h
 # @todo: update dataset headers
diff --git a/test/jni/Android.mk b/test/jni/Android.mk
index 153b4eb840..a249188160 100644
--- a/test/jni/Android.mk
+++ b/test/jni/Android.mk
@@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \
 $(NNTRAINER_ROOT)/nntrainer/opencl \
 $(NNTRAINER_ROOT)/nntrainer/optimizers \
 $(NNTRAINER_ROOT)/nntrainer/tensor \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/aarch64 \
 $(NNTRAINER_ROOT)/nntrainer/tensor/cl_operations \
 $(NNTRAINER_ROOT)/nntrainer/utils \
 $(NNTRAINER_ROOT)/api \
diff --git a/test/unittest/jni/Android.mk b/test/unittest/jni/Android.mk
index 40fe50d28e..8a1d9af5bc 100644
--- a/test/unittest/jni/Android.mk
+++ b/test/unittest/jni/Android.mk
@@ -20,6 +20,7 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \
 $(NNTRAINER_ROOT)/nntrainer/graph \
 $(NNTRAINER_ROOT)/nntrainer/optimizers \
 $(NNTRAINER_ROOT)/nntrainer/tensor \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \
 $(NNTRAINER_ROOT)/nntrainer/utils \
 $(NNTRAINER_ROOT)/api \
 $(NNTRAINER_ROOT)/api/ccapi/include \
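--
Usage note (illustrative, not part of the patch): after this refactor, tensor
code includes cpu_backend.h instead of blas_interface.h, and the build links
whichever backend (x86, ARM, or fallback) meson selected. A minimal sketch of
a call through the unified API; the matrix values and the storage-order
constant 0 (ROW_MAJOR, per the defines in the new backend sources) are
assumptions for illustration:

    #include <vector>
    #include <cpu_backend.h>

    int main() {
      const unsigned int M = 2, N = 2, K = 2;
      std::vector<float> A = {1.f, 2.f, 3.f, 4.f}; // M x K, row-major
      std::vector<float> B = {5.f, 6.f, 7.f, 8.f}; // K x N, row-major
      std::vector<float> C(M * N, 0.f);            // M x N output
      // C = 1.0 * A * B + 0.0 * C; no transposition, row-major storage,
      // leading dimensions equal to the row widths.
      nntrainer::sgemm(0, false, false, M, N, K, 1.f, A.data(), K,
                       B.data(), N, 0.f, C.data(), N);
      return 0; // C now holds {19, 22, 43, 50}
    }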