From d8792f80408255272a407d59d457fdcbb674a9b5 Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Wed, 5 Jul 2023 09:25:05 -0700
Subject: [PATCH] Fix TRT EP allocator memory leak (#16552)

Fix memory leak issue which comes from TRT EP's allocator object not
being released upon destruction.
Following is the log from valgrind:
```
==1911860== 100,272 (56 direct, 100,216 indirect) bytes in 1 blocks are definitely lost in loss record 1,751 of 1,832
==1911860==    at 0x483CFA3: operator new(unsigned long) (vg_replace_malloc.c:472)
==1911860==    by 0x315DC2: std::_MakeUniq<onnxruntime::OrtAllocatorImplWrappingIAllocator>::__single_object std::make_unique<onnxruntime::OrtAllocatorImplWrappingIAllocator, std::shared_ptr<onnxruntime::IAllocator> >(std::shared_ptr<onnxruntime::IAllocator>&&) (unique_ptr.h:857)
==1911860==    by 0x30EE7B: OrtApis::KernelContext_GetAllocator(OrtKernelContext const*, OrtMemoryInfo const*, OrtAllocator**) (custom_ops.cc:121)
==1911860==    by 0x660D115: onnxruntime::TensorrtExecutionProvider::Compile(std::vector<onnxruntime::IExecutionProvider::FusedNodeAndGraph, std::allocator<onnxruntime::IExecutionProvider::FusedNodeAndGraph> > const&, std::vector<onnxruntime::NodeComputeInfo, std::allocator<onnxruntime::NodeComputeInfo> >&)::{lambda(void*, OrtApi const*, OrtKernelContext*)#3}::operator()(void*, OrtApi const*, OrtKernelContext*) const (tensorrt_execution_provider.cc:2223)
```
This issue happens after this [EP allocator
refactor](https://github.com/microsoft/onnxruntime/pull/15833)
---
 .../tensorrt/tensorrt_execution_provider.cc      | 16 +++++++++++++---
 .../tensorrt/tensorrt_execution_provider.h       |  4 ++++
 2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index cb0a0ed432a64..3e1b03871e18d 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -14,6 +14,7 @@
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
+#include "core/session/allocator_adapters.h"
 #include "cuda_runtime_api.h"
 #include "core/common/gsl.h"
 #include <unordered_map>
@@ -991,6 +992,12 @@ TensorrtExecutionProvider::~TensorrtExecutionProvider() {
     ORT_IGNORE_RETURN_VALUE(CUDA_CALL(cudaStreamDestroy(stream_)));
   }
   ReleaseTensorRTCustomOpDomainList(info_.custom_op_domain_list);
+
+  if (alloc_ != nullptr) {
+    // This code is same as OrtApis::ReleaseAllocator defined in allocator_adapters.cc.
+    // We can't get api inside destructor so that's why we duplicate the code here.
+    delete static_cast<OrtAllocatorImpl*>(alloc_);
+  }
 }
 
 bool TensorrtExecutionProvider::IsGraphCaptureEnabled() const {
@@ -2213,15 +2220,18 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       auto trt_context = trt_state->context->get();
       auto trt_profiles = trt_state->profiles;
       auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr;
-      OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0));
-      OrtAllocator* alloc;
-      Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc));
       int num_inputs = static_cast<int>(input_indexes.size());
       int num_outputs = static_cast<int>(output_indexes.size());
       bool engine_update = false;
       std::unordered_set<std::string> input_names;
       std::unordered_map<std::string, std::vector<int32_t>> tensor_shape_values;
 
+      OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0));
+      if (alloc_ == nullptr) {
+        Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_));
+      }
+      OrtAllocator* alloc = alloc_;
+
       void* cuda_stream;
       Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream));
       cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream);
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 35e7a110c5ed7..56eda7ad83537 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -214,6 +214,10 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   bool detailed_build_log_ = false;
   bool cuda_graph_enable_ = false;
 
+  // The OrtAllocator object will be get during ep compute time
+  // and should be kept for the lifetime of TRT EP object.
+  OrtAllocator* alloc_ = nullptr;
+
   std::unique_ptr<CUDAGraph> cuda_graph_;  // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph pointer is enough (no need to maintain one CUDAGraph pointer per TRT subgraph)
   bool is_graph_captured_ = false;
   int regular_run_count_before_graph_capture_ = 0;