[BYOC][TensorRT] Add TensorRT own int8 calibration support to TensorRT BYOC integration (#8808)
* update trt

* clean codes

* testing running trt

* clean data

* clean codes?

* remove env func

* fix num_bindings

* add buildfromjson func

* change condition

* reset input and output func

* re-config func

* re-added trt version check

* checking sanity

* try to fix sanity issue

* checking sanity

* fixing sanity issue

* fixing sanity issue

* fixing sanity

* clang format fixed

* clang format fixing

* clean trt cali

* try to fix clang format

* fixed some comments

* remove double destroy engine codes

* modify comments

* add checking function

* add trt int8 test

* update trt int8 test file

* Update test_tensorrt_int8_exp.py

* update trt int8 file

* change a little

* update trt int8 file

* update trt int8 file

* fixing ci

* fixing ci

* fixing ci

* fixing ci

* fixing ci

* fixing ci issue

* fixing ci issue

* fixing ci

* fixing ci issue

* fixing ci

* fixing ci problem

* fixing ci

* update trt python int8 test file

* fixed ci

* fixed ci

* fix gpu build

* fixed ci

* update trt int8 test file

* fix bug

* fix bug

* update trt int8 file

* reformat

* update trt int8 file

* update

* modify
tiandiao123 committed Sep 15, 2021
1 parent e44f6c0 commit 6f5b674
Showing 5 changed files with 399 additions and 18 deletions.
19 changes: 17 additions & 2 deletions src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -40,14 +40,16 @@ namespace contrib {
TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger,
const std::vector<const DLTensor*>& data_entry,
size_t max_workspace_size, bool use_implicit_batch, bool use_fp16,
int batch_size)
int batch_size, nvinfer1::IInt8Calibrator* calibrator)
: data_entry_(data_entry),
max_workspace_size_(max_workspace_size),
use_implicit_batch_(use_implicit_batch),
use_fp16_(use_fp16),
batch_size_(batch_size) {
// Create TRT builder and network.
builder_ = nvinfer1::createInferBuilder(*logger);
use_int8_ = false;

#if TRT_VERSION_GE(6, 0, 1)
// Use INetworkV2.
auto flags =
@@ -56,9 +58,15 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger,
flags = 0U;
builder_->setMaxBatchSize(batch_size_);
}
this->calibrator_ = calibrator;
if (calibrator != nullptr) {
use_int8_ = true;
builder_->setFp16Mode(true);
builder_->setInt8Mode(true);
builder_->setInt8Calibrator(calibrator);
}
network_ = builder_->createNetworkV2(flags);
#else
// Use INetwork with implicit batch.
builder_->setMaxBatchSize(batch_size_);
builder_->setMaxWorkspaceSize(max_workspace_size_);
builder_->setFp16Mode(use_fp16_);
@@ -158,6 +166,13 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
if (use_fp16_) {
config_->setFlag(nvinfer1::BuilderFlag::kFP16);
}

if (use_int8_) {
config_->setFlag(nvinfer1::BuilderFlag::kINT8);
config_->setInt8Calibrator(calibrator_);
LOG(INFO) << "Configured TensorRT builder for INT8 mode with calibrator.";
}

// Add profiles.
if (!use_implicit_batch_) {
auto profile = builder_->createOptimizationProfile();
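For context, a minimal sketch of how a caller might drive the new constructor argument end to end. This is not part of the diff: the helper name BuildInt8Engine and its parameters are illustrative assumptions; only TensorRTCalibrator, TensorRTBuilder, and BuildEngine() come from the changed files.

// Sketch only: wiring the new calibrator argument through the builder.
void BuildInt8Engine(TensorRTLogger* logger,
                     const std::vector<const DLTensor*>& data_entry,
                     const std::vector<void*>& device_bindings,
                     const std::vector<size_t>& binding_sizes) {
  // Record representative batches (device pointers) before building.
  TensorRTCalibrator calibrator(/*batch_size=*/1, {"input_0"});
  calibrator.AddBatchData(device_bindings, binding_sizes);
  // A non-null calibrator makes the builder enable INT8 (see the hunks above).
  TensorRTBuilder builder(logger, data_entry, /*max_workspace_size=*/1 << 30,
                          /*use_implicit_batch=*/true, /*use_fp16=*/false,
                          /*batch_size=*/1, &calibrator);
  TensorRTEngineAndContext engine_and_context = builder.BuildEngine();
  (void)engine_and_context;  // Use the calibrated engine for inference.
}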
11 changes: 9 additions & 2 deletions src/runtime/contrib/tensorrt/tensorrt_builder.h
@@ -72,8 +72,8 @@ class TensorRTBuilder {
* \param batch_size Batch size to build the engine for when use_implicit_batch is enabled.
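* \param calibrator Calibrator used to supply batch data for INT8 calibration; nullptr (the default) builds in FP32/FP16 precision.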
*/
TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
size_t max_workspace_size, bool use_implicit_batch, bool use_fp16,
int batch_size);
size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, int batch_size,
nvinfer1::IInt8Calibrator* calibrator = nullptr);

/*!
* \brief Add TensorRT input(s) for input node in network definition.
@@ -153,6 +153,9 @@ class TensorRTBuilder {
/*! \brief Whether to automatically convert model to 16-bit floating point precision. */
bool use_fp16_;

/*! \brief Whether to automatically convert model to 8-bit integer precision. */
bool use_int8_;

/*! \brief Batch size to optimize for. */
int batch_size_;

@@ -161,6 +164,10 @@

/*! \brief Output names. */
std::vector<std::string> network_output_names_;

/*! \brief Calibrator that supplies batch data when building in INT8 mode.
* Will be nullptr when building with FP16 or FP32 precision. */
nvinfer1::IInt8Calibrator* calibrator_;
};

} // namespace contrib
130 changes: 130 additions & 0 deletions src/runtime/contrib/tensorrt/tensorrt_calibrator.h
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* \file runtime/contrib/tensorrt/tensorrt_calibrator.h
* \brief Contains the TensorRTCalibrator class, which implements TensorRT's
* IInt8EntropyCalibrator2 interface to supply batch data during INT8 calibration.
*/

#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_

#include <string>
#include <vector>

#include "../../cuda/cuda_common.h"
#include "NvInfer.h"

namespace tvm {
namespace runtime {

class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
public:
TensorRTCalibrator(int batch_size, const std::vector<std::string>& input_names)
: batch_size_(batch_size), num_batches_calibrated_(0), input_names_(input_names) {}

~TensorRTCalibrator() {
// Free calibration data
for (auto& inputs : data_) {
for (size_t i = 0; i < inputs.size(); ++i) {
delete[] inputs[i];
}
}
// Free buffers
for (size_t i = 0; i < buffers_.size(); ++i) {
CUDA_CALL(cudaFree(buffers_[i]));
}
}

void AddBatchData(const std::vector<void*>& bindings, const std::vector<size_t>& binding_sizes) {
// Copy data from GPU
std::vector<float*> data_host(bindings.size(), nullptr);
for (size_t i = 0; i < bindings.size(); ++i) {
data_host[i] = new float[batch_size_ * binding_sizes[i]];
CUDA_CALL(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost));
}
data_.push_back(data_host);
data_sizes_.push_back(binding_sizes);
}

int getBatchSize() const override { return batch_size_; }

/*!
* \brief TensorRT will call this method to get next batch of data to
* calibrate with.
*/
bool getBatch(void* bindings[], const char* names[], int nbBindings) override {
AllocateBuffersIfNotAllocated();
CHECK_EQ(input_names_.size(), nbBindings);
for (size_t i = 0; i < input_names_.size(); ++i) {
CHECK_EQ(input_names_[i], names[i]);
CUDA_CALL(cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float),
cudaMemcpyHostToDevice));
bindings[i] = buffers_[i];
}
num_batches_calibrated_++;
// TODO(trevmorr): Free data from previous batch?
return (num_batches_calibrated_ < data_.size());
}

const void* readCalibrationCache(size_t& length) override {
if (calibration_cache_.empty()) return nullptr;
length = calibration_cache_.size();
return calibration_cache_.data();
}

void writeCalibrationCache(const void* cache, size_t length) override {
calibration_cache_.assign(static_cast<const char*>(cache), length);
}

private:
/*! \brief Batch size. */
int batch_size_;
/*! \brief Number of batches already fed to calibrator. */
int num_batches_calibrated_;
/*! \brief Storage for calibration cache. */
std::string calibration_cache_;

/*! \brief Data to be used for calibration. */
std::vector<std::vector<float*>> data_;
/*! \brief Number of elements for data to be used for calibration. */
std::vector<std::vector<size_t>> data_sizes_;

/*! \brief Device buffers to be used for calibration. */
std::vector<void*> buffers_;

/*! \brief Names of inputs. */
const std::vector<std::string> input_names_;

/*! \brief Allocate device memory buffers. data_sizes_ must already have one
* entry. */
void AllocateBuffersIfNotAllocated() {
if (!buffers_.empty()) return;
CHECK_GE(data_sizes_.size(), 1);
const int num_inputs = data_sizes_[0].size();
buffers_.assign(num_inputs, nullptr);
for (int i = 0; i < num_inputs; ++i) {
CUDA_CALL(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float)));
}
}
};

} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
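The handshake between the two halves of this class: AddBatchData() is called ahead of time with device pointers and snapshots each batch to host memory; during engine construction TensorRT repeatedly invokes getBatch(), which copies the next recorded batch back into device buffers and returns false once all batches are consumed, ending calibration. The IInt8EntropyCalibrator2 base class then has TensorRT pick per-tensor dynamic ranges using entropy (KL-divergence) calibration. Below is a minimal sketch of the consuming side, assuming a TRT 6-style config-based build; the helper name is illustrative, while setFlag() and setInt8Calibrator() mirror the BuildEngine() hunk above.

// Sketch only: how an engine build drives TensorRTCalibrator.
nvinfer1::ICudaEngine* BuildCalibratedEngine(
    nvinfer1::IBuilder* builder, nvinfer1::INetworkDefinition* network,
    nvinfer1::IBuilderConfig* config,
    tvm::runtime::TensorRTCalibrator* calibrator) {
  config->setFlag(nvinfer1::BuilderFlag::kINT8);
  config->setInt8Calibrator(calibrator);
  // buildEngineWithConfig() calls getBatch() once per batch recorded via
  // AddBatchData(), then computes INT8 scales from the observed activations.
  return builder->buildEngineWithConfig(*network, *config);
}

Calibration quality depends on how representative the recorded batches are; feeding too few or unrepresentative batches is the usual cause of INT8 accuracy loss.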
