From 475ef62b7114a6b54da58559dc7eebe65b18f9a7 Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Thu, 21 Dec 2023 16:45:07 -0800
Subject: [PATCH 01/11] Add initial CUDA native UT

---
 test/data/cuda/test_fastgelu.onnx     | Bin 0 -> 128 bytes
 test/data/cuda/test_fastgelu_f16.onnx | Bin 0 -> 128 bytes
 test/data/cuda/test_negpos.onnx       | Bin 0 -> 175 bytes
 test/shared_test/test_kernel.hpp      |   3 ++-
 test/shared_test/test_ortops.cc       |  22 ++++++++++++++++-
 test/shared_test/test_ortops_cuda.cc  |  34 ++++++++++++++++++++++++++
 6 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 test/data/cuda/test_fastgelu.onnx
 create mode 100644 test/data/cuda/test_fastgelu_f16.onnx
 create mode 100644 test/data/cuda/test_negpos.onnx
 create mode 100644 test/shared_test/test_ortops_cuda.cc
diff --git a/test/data/cuda/test_fastgelu.onnx b/test/data/cuda/test_fastgelu.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..13eb488129f056fd81dc0dc8ce605eb15535db06
GIT binary patch
literal 128
zcmd<!u!`bR=VGkjVoAzOEEZy{RN`<;EG}_R%_+6wPt4TI&&#XOOU}<LDauR|Vl7E6
uE-{Sa2C5fg<6_}p6k>?tgX)CxlYo+yFi9tFE_SFH0zyv0T<8W0FaiK|!5v2c

literal 0
HcmV?d00001

diff --git a/test/data/cuda/test_fastgelu_f16.onnx b/test/data/cuda/test_fastgelu_f16.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..83f90e176dd57bbef79fe2b5ddb7123f77b10f64
GIT binary patch
literal 128
zcmd<!u!`bR=VGkjVoAzOEEZy{RN`<;EG}_R%_+6wPt4TI&&#XOOU}<LDauR|Vl7E6
uE-{Sa2C5fg<6`095@LwrgX)CxlYo+yFi9tFE_SFH0zyv0T<8W0FaiL3A{}A?

literal 0
HcmV?d00001

diff --git a/test/data/cuda/test_negpos.onnx b/test/data/cuda/test_negpos.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..b72717e488a61a10dc76b64effb35f66236a8273
GIT binary patch
literal 175
zcmd<!u<BsslHg*j5aP^CNzE(CEU7e9;_!qrx%9XY!a~e>sp&$@1^LBFY<{Wf0r|yN
z{E3-*`FVL2ddc~DB}JJ@LaZgJ#U%z&{2&9lxHvc%g;=;4xEPWIxxkvygn&jNt8(Jz
RVuu<iAmk*>g>Je4BLEC+DGUGr

literal 0
HcmV?d00001

diff --git a/test/shared_test/test_kernel.hpp b/test/shared_test/test_kernel.hpp
index 026048b99..549e4d699 100644
--- a/test/shared_test/test_kernel.hpp
+++ b/test/shared_test/test_kernel.hpp
@@ -52,6 +52,7 @@ void RunSession(Ort::Session& session_object,
 void TestInference(Ort::Env& env, const ORTCHAR_T* model_uri,
                    const std::vector<TestValue>& inputs,
                    const std::vector<TestValue>& outputs,
-                   OutputValidator output_validator = nullptr);
+                   OutputValidator output_validator = nullptr,
+                   void* cuda_compute_stream = nullptr);
 
 void GetTensorMutableDataString(const OrtApi& api, const OrtValue* value, std::vector<std::string>& output);
diff --git a/test/shared_test/test_ortops.cc b/test/shared_test/test_ortops.cc
index 4c68a1b36..be981cc43 100644
--- a/test/shared_test/test_ortops.cc
+++ b/test/shared_test/test_ortops.cc
@@ -296,11 +296,31 @@ void ValidateOutputEqual(size_t output_idx, Ort::Value& actual, TestValue expect
   }
 }
 
+OrtCUDAProviderOptions CreateDefaultOrtCudaProviderOptionsWithCustomStream(void* cuda_compute_stream) {  // builds the CUDA EP options used by TestInference
+  OrtCUDAProviderOptions cuda_options;
+
+  cuda_options.device_id = 0;  // hard-coded to device index 0
+  cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch::OrtCudnnConvAlgoSearchExhaustive;
+  cuda_options.gpu_mem_limit = std::numeric_limits<size_t>::max();  // largest size_t value, i.e. no practical cap
+  cuda_options.arena_extend_strategy = 0;  // NOTE(review): 0 is presumably ORT's default strategy - confirm against the C API header
+  cuda_options.do_copy_in_default_stream = true;
+  cuda_options.has_user_compute_stream = cuda_compute_stream != nullptr ? 1 : 0;  // 1 only when the caller passed a non-null stream
+  cuda_options.user_compute_stream = cuda_compute_stream;  // stored as-is; nullptr when no custom stream was given
+  cuda_options.default_memory_arena_cfg = nullptr;
+
+  return cuda_options;  // returned by value
+}
+
 void TestInference(Ort::Env& env, const ORTCHAR_T* model_uri,
                    const std::vector<TestValue>& inputs,
                    const std::vector<TestValue>& outputs,
-                   OutputValidator output_validator) {
+                   OutputValidator output_validator,
+                   void* cuda_compute_stream) {
   Ort::SessionOptions session_options;
+#ifdef USE_CUDA
+  auto cuda_options = CreateDefaultOrtCudaProviderOptionsWithCustomStream(cuda_compute_stream);
+  session_options.AppendExecutionProvider_CUDA(cuda_options);
+#endif
   auto library_handle = RegisterExtOps(session_options);
 
   // if session creation passes, model loads fine
diff --git a/test/shared_test/test_ortops_cuda.cc b/test/shared_test/test_ortops_cuda.cc
new file mode 100644
index 000000000..8cc2f440a
--- /dev/null
+++ b/test/shared_test/test_ortops_cuda.cc
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <filesystem>
+#include <locale>
+#include "gtest/gtest.h"
+#include "ocos.h"
+#include "test_kernel.hpp"
+
+TEST(tokenizer_opertors, test_bert_tokenizer) {
+  auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
+
+  std::vector<TestValue> inputs(2);
+  inputs[0].name = "x";
+  inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  inputs[0].dims = {6};
+  inputs[0].values_float = {0., 1., 2., 3., 4., 5.};
+
+  inputs[1].name = "bias";
+  inputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  inputs[1].dims = {6};
+  inputs[1].values_float = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5};
+
+  std::vector<TestValue> outputs(1);
+  outputs[0].name = "y";
+  outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  outputs[0].dims = {6};
+  outputs[0].values_int64 = {0., 0.9505811, 2.1696784, 3.298689, 4.399991, 5.5};
+
+  std::filesystem::path model_path = "data/cuda";
+  model_path /= "test_fastgelu.onnx";
+
+  TestInference(*ort_env, model_path.c_str(), inputs, outputs);
+}
\ No newline at end of file

From d04160d5edbcb9ba694a73f25e05fd146f611433 Mon Sep 17 00:00:00 2001
From: Wenbing Li <wenbingl@outlook.com>
Date: Thu, 21 Dec 2023 21:41:18 -0800
Subject: [PATCH 02/11] fix the build issue

---
 test/shared_test/test_ortops_cuda.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/shared_test/test_ortops_cuda.cc b/test/shared_test/test_ortops_cuda.cc
index 8cc2f440a..fe85659dd 100644
--- a/test/shared_test/test_ortops_cuda.cc
+++ b/test/shared_test/test_ortops_cuda.cc
@@ -7,7 +7,7 @@
 #include "ocos.h"
 #include "test_kernel.hpp"
 
-TEST(tokenizer_opertors, test_bert_tokenizer) {
+TEST(CudaOp, test_fastgelu) {
   auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
 
   std::vector<TestValue> inputs(2);
@@ -25,7 +25,7 @@ TEST(tokenizer_opertors, test_bert_tokenizer) {
   outputs[0].name = "y";
   outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
   outputs[0].dims = {6};
-  outputs[0].values_int64 = {0., 0.9505811, 2.1696784, 3.298689, 4.399991, 5.5};
+  outputs[0].values_float = {0., 0.9505811, 2.1696784, 3.298689, 4.399991, 5.5};
 
   std::filesystem::path model_path = "data/cuda";
   model_path /= "test_fastgelu.onnx";

From 94078361e0cf5103adac71dd8b6081c4d896e9d4 Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Fri, 22 Dec 2023 13:28:14 -0800
Subject: [PATCH 03/11] fix other build error

---
 test/shared_test/test_ortops_cuda.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/shared_test/test_ortops_cuda.cc b/test/shared_test/test_ortops_cuda.cc
index fe85659dd..7421f8427 100644
--- a/test/shared_test/test_ortops_cuda.cc
+++ b/test/shared_test/test_ortops_cuda.cc
@@ -7,6 +7,8 @@
 #include "ocos.h"
 #include "test_kernel.hpp"
 
+#ifdef USE_CUDA
+
 TEST(CudaOp, test_fastgelu) {
   auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
 
@@ -31,4 +33,6 @@ TEST(CudaOp, test_fastgelu) {
   model_path /= "test_fastgelu.onnx";
 
   TestInference(*ort_env, model_path.c_str(), inputs, outputs);
-}
\ No newline at end of file
+}
+
+#endif

From 6a0a65984046e4d5af18e70cab1d307951cc2e6b Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Wed, 27 Dec 2023 15:21:26 -0800
Subject: [PATCH 04/11] add 30 mins to android packaging pipeline timeout due
 to early timing out

---
 .pipelines/android_packaging.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/android_packaging.yml b/.pipelines/android_packaging.yml
index 0ee0ee01b..af5aae514 100644
--- a/.pipelines/android_packaging.yml
+++ b/.pipelines/android_packaging.yml
@@ -4,7 +4,7 @@ jobs:
   - job: AndroidPackaging
     pool:
       vmImage: "macOS-13"
-    timeoutInMinutes: 120
+    timeoutInMinutes: 150
     variables:
       buildConfig: Release
     steps:

From d623f4e8fcc5da5fd610d5a3460808334df23deb Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Thu, 28 Dec 2023 15:51:41 -0800
Subject: [PATCH 05/11] undo android pipeline timeout change - move to other PR

---
 .pipelines/android_packaging.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/android_packaging.yml b/.pipelines/android_packaging.yml
index af5aae514..0ee0ee01b 100644
--- a/.pipelines/android_packaging.yml
+++ b/.pipelines/android_packaging.yml
@@ -4,7 +4,7 @@ jobs:
   - job: AndroidPackaging
     pool:
       vmImage: "macOS-13"
-    timeoutInMinutes: 150
+    timeoutInMinutes: 120
     variables:
       buildConfig: Release
     steps:

From cb784085e6285e1aee999724dbcd1bc2283de8fb Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Wed, 3 Jan 2024 18:02:58 -0800
Subject: [PATCH 06/11] revert ifdef for testing ci

---
 test/shared_test/test_ortops_cuda.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/shared_test/test_ortops_cuda.cc b/test/shared_test/test_ortops_cuda.cc
index 7421f8427..19e32e96e 100644
--- a/test/shared_test/test_ortops_cuda.cc
+++ b/test/shared_test/test_ortops_cuda.cc
@@ -7,8 +7,6 @@
 #include "ocos.h"
 #include "test_kernel.hpp"
 
-#ifdef USE_CUDA
-
 TEST(CudaOp, test_fastgelu) {
   auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
 
@@ -35,4 +33,3 @@ TEST(CudaOp, test_fastgelu) {
   TestInference(*ort_env, model_path.c_str(), inputs, outputs);
 }
 
-#endif

From 0a27c6285500a7561e760f5def0aa31a7fd5f600 Mon Sep 17 00:00:00 2001
From: Sayan Shaw <sayanshaw@microsoft.com>
Date: Thu, 11 Jan 2024 10:48:08 -0800
Subject: [PATCH 07/11] add #ifdef USE_CUDA guard around the CUDA test

---
 test/shared_test/test_ortops_cuda.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/shared_test/test_ortops_cuda.cc b/test/shared_test/test_ortops_cuda.cc
index 19e32e96e..dc9ae35b2 100644
--- a/test/shared_test/test_ortops_cuda.cc
+++ b/test/shared_test/test_ortops_cuda.cc
@@ -7,6 +7,8 @@
 #include "ocos.h"
 #include "test_kernel.hpp"
 
+#ifdef USE_CUDA
+
 TEST(CudaOp, test_fastgelu) {
   auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
 
@@ -33,3 +35,4 @@ TEST(CudaOp, test_fastgelu) {
   TestInference(*ort_env, model_path.c_str(), inputs, outputs);
 }
 
+#endif
\ No newline at end of file

From 80296371fedd2922d588143784264dd54682e5bb Mon Sep 17 00:00:00 2001
From: Wenbing Li <wenbingl@outlook.com>
Date: Fri, 12 Jan 2024 14:27:16 -0800
Subject: [PATCH 08/11] update ci ORT linux package name

---
 .pipelines/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 235e72f48..2987a41ad 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -584,7 +584,7 @@ stages:
           userRepository: 'microsoft/onnxruntime'
           defaultVersionType: 'specificTag'
           version: 'v$(ORT_VERSION)'
-          itemPattern: '*-linux-x64-$(ORT_VERSION)*'
+          itemPattern: '*-linux-x64-gpu-$(ORT_VERSION)*'
           downloadPath: '$(Build.SourcesDirectory)'
         displayName: Download the ONNXRuntime prebuilt package.
 

From ce60435c76d038f09a5730d88dab1961d8ce97ff Mon Sep 17 00:00:00 2001
From: Wenbing Li <wenbingl@outlook.com>
Date: Fri, 12 Jan 2024 15:53:06 -0800
Subject: [PATCH 09/11] update the package extraction path

---
 .pipelines/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 2987a41ad..73394c135 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -615,7 +615,7 @@ stages:
               /bin/bash -c "
                 set -ex; \
                 pushd /onnxruntime-extensions; \
-                sh ./build.sh -DOCOS_ENABLE_CTEST=ON -DOCOS_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86 -DOCOS_ONNXRUNTIME_VERSION="$(ORT_VERSION)" -DONNXRUNTIME_PKG_DIR=/onnxruntime; \
+                sh ./build.sh -DOCOS_ENABLE_CTEST=ON -DOCOS_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86 -DOCOS_ONNXRUNTIME_VERSION="$(ORT_VERSION)" -DONNXRUNTIME_PKG_DIR=$(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ort.version); \
                 popd; \
                 "
           workingDirectory: $(Build.SourcesDirectory)

From 028f9f27ba736f4849e955559a150e9d20198380 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Fri, 12 Jan 2024 19:20:43 -0800
Subject: [PATCH 10/11] Update ci.yml: mount the GPU ORT package dir as /onnxruntime

---
 .pipelines/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 73394c135..468bb1f37 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -609,13 +609,13 @@ stages:
           script: |
             docker run --gpus all --rm \
               --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
               -e CUDA_PATH=/usr/local/cuda-11.8 \
               onnxruntime-extensionscuda11build \
               /bin/bash -c "
                 set -ex; \
                 pushd /onnxruntime-extensions; \
-                sh ./build.sh -DOCOS_ENABLE_CTEST=ON -DOCOS_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86 -DOCOS_ONNXRUNTIME_VERSION="$(ORT_VERSION)" -DONNXRUNTIME_PKG_DIR=$(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ort.version); \
+                sh ./build.sh -DOCOS_ENABLE_CTEST=ON -DOCOS_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86 -DOCOS_ONNXRUNTIME_VERSION="$(ORT_VERSION)" -DONNXRUNTIME_PKG_DIR=/onnxruntime; \
                 popd; \
                 "
           workingDirectory: $(Build.SourcesDirectory)

From 4c0b3b7a5b8eeec2e5fe0da84502f0e417d79658 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Sat, 13 Jan 2024 08:08:17 -0800
Subject: [PATCH 11/11] Update ci.yml: bump ORT_VERSION to 1.16.3, use GPU package mounts

---
 .pipelines/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 468bb1f37..09a93c5ea 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -558,7 +558,7 @@ stages:
       name: 'onnxruntime-extensions-Linux-GPU-A10'
     timeoutInMinutes: 120
     variables:
-      ORT_VERSION: '1.16.2'
+      ORT_VERSION: '1.16.3'
       TORCH_VERSION: 'torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118'
     steps:
       - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@@ -626,7 +626,7 @@ stages:
           script: |
             docker run --gpus all --rm \
               --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
               -e CUDA_PATH=/usr/local/cuda-11.8 \
               onnxruntime-extensionscuda11build \
               /bin/bash -c "
@@ -644,7 +644,7 @@ stages:
           script: |
             docker run --gpus all --rm \
               --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
               -e CUDA_PATH=/usr/local/cuda-11.8 \
               onnxruntime-extensionscuda11build \
               /bin/bash -c "