Skip to content

Commit

Permalink
Fix TRT EP allocator memory leak (microsoft#16552)
Browse files Browse the repository at this point in the history
Fix a memory leak caused by the TRT EP's allocator object not
being released upon destruction.
Following is the log from valgrind:
```
==1911860== 100,272 (56 direct, 100,216 indirect) bytes in 1 blocks are definitely lost in loss record 1,751 of 1,832
==1911860==    at 0x483CFA3: operator new(unsigned long) (vg_replace_malloc.c:472)
==1911860==    by 0x315DC2: std::_MakeUniq<onnxruntime::OrtAllocatorImplWrappingIAllocator>::__single_object std::make_unique<onnxruntime::OrtAllocatorImplWrappingIAllocator, std::shared_ptr<onnxruntime::IAllocator> >(std::shared_ptr<onnxruntime::IAllocator>&&) (unique_ptr.h:857)
==1911860==    by 0x30EE7B: OrtApis::KernelContext_GetAllocator(OrtKernelContext const*, OrtMemoryInfo const*, OrtAllocator**) (custom_ops.cc:121)
==1911860==    by 0x660D115: onnxruntime::TensorrtExecutionProvider::Compile(std::vector<onnxruntime::IExecutionProvider::FusedNodeAndGraph, std::allocator<onnxruntime::IExecutionProvider::FusedNodeAndGraph> > const&, std::vector<onnxruntime::NodeComputeInfo, std::allocator<onnxruntime::NodeComputeInfo> >&)::{lambda(void*, OrtApi const*, OrtKernelContext*)#3}::operator()(void*, OrtApi const*, OrtKernelContext*) const (tensorrt_execution_provider.cc:2223)
```
This issue happens after this [EP allocator
refactor](microsoft#15833)
  • Loading branch information
chilo-ms authored Jul 5, 2023
1 parent 9799d43 commit d8792f8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
16 changes: 13 additions & 3 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "core/providers/cuda/gpu_data_transfer.h"
#include "core/session/allocator_adapters.h"
#include "cuda_runtime_api.h"
#include "core/common/gsl.h"
#include <unordered_map>
Expand Down Expand Up @@ -991,6 +992,12 @@ TensorrtExecutionProvider::~TensorrtExecutionProvider() {
ORT_IGNORE_RETURN_VALUE(CUDA_CALL(cudaStreamDestroy(stream_)));
}
ReleaseTensorRTCustomOpDomainList(info_.custom_op_domain_list);

if (alloc_ != nullptr) {
// This code is the same as OrtApis::ReleaseAllocator defined in allocator_adapters.cc.
// We can't obtain the OrtApi object inside the destructor, so we duplicate the code here.
delete static_cast<OrtAllocatorImpl*>(alloc_);
}
}

bool TensorrtExecutionProvider::IsGraphCaptureEnabled() const {
Expand Down Expand Up @@ -2213,15 +2220,18 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
auto trt_context = trt_state->context->get();
auto trt_profiles = trt_state->profiles;
auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr;
OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0));
OrtAllocator* alloc;
Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc));
int num_inputs = static_cast<int>(input_indexes.size());
int num_outputs = static_cast<int>(output_indexes.size());
bool engine_update = false;
std::unordered_set<std::string> input_names;
std::unordered_map<std::string, std::vector<int32_t>> tensor_shape_values;

OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0));
if (alloc_ == nullptr) {
Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_));
}
OrtAllocator* alloc = alloc_;

void* cuda_stream;
Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream));
cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool detailed_build_log_ = false;
bool cuda_graph_enable_ = false;

// The OrtAllocator object is obtained lazily at EP compute time
// and must be kept alive for the lifetime of the TRT EP object.
OrtAllocator* alloc_ = nullptr;

std::unique_ptr<CUDAGraph> cuda_graph_; // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph pointer is enough (no need to maintain one CUDAGraph pointer per TRT subgraph)
bool is_graph_captured_ = false;
int regular_run_count_before_graph_capture_ = 0;
Expand Down

0 comments on commit d8792f8

Please sign in to comment.