Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improves performance of operator Transpose #5550

Merged
merged 16 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 131 additions & 92 deletions onnxruntime/core/providers/cpu/tensor/transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "core/providers/cpu/tensor/transpose.h"
#include "core/framework/utils.h"
#include "utils.h"
namespace onnxruntime {

/* A permutation [a,b,c,...] indicates that
Expand All @@ -12,23 +13,65 @@ namespace onnxruntime {
etc.
*/

// ComputeOffset: compute offset into a tensor. This is essentially the dot-product of
// index and stride, restricted to the specified number of axes.
static inline size_t ComputeOffset(const std::vector<int64_t>& index, const std::vector<size_t>& stride, int64_t num_axes) {
size_t offset = 0;
for (int64_t j = 0; j < num_axes; ++j) {
offset += index[j] * stride[j];
// Per-axis iteration state for walking a transposed tensor:
// `index` is the current coordinate along the axis, `upper_bound` the axis
// dimension, and `stride` the step (in bytes, or in elements for the string
// overloads) to advance in the source when `index` is incremented.
struct MultiIndex {
  size_t index{0};
  size_t upper_bound{0};
  int64_t stride{0};
  // Default-constructed entries are now zero-initialized; the original empty
  // constructor left all three members indeterminate, which is undefined
  // behavior to read before assignment.
  MultiIndex() = default;
  MultiIndex(size_t i, size_t n, int64_t s) : index(i), upper_bound(n), stride(s) {}
};

// Initializes the MultiIndex array consumed by IncrementIndexAndComputeOffset.
// Axes whose target dimension is 1 can never change the source offset, so they
// are dropped from the odometer entirely. `stride` is pre-scaled by
// `element_size` so the increment routine works directly in bytes (or in
// elements when element_size == 1). Returns the number of axes kept.
static size_t IncrementIndexAndComputeOffsetSetup(MultiIndex* mindex, int64_t num_axes, const std::vector<int64_t>& target_dims,
                                                  const std::vector<size_t>& stride, size_t element_size) {
  size_t kept = 0;
  for (int64_t axis = 0; axis < num_axes; ++axis) {
    if (target_dims[axis] != 1) {
      mindex[kept++] = MultiIndex(0, static_cast<size_t>(target_dims[axis]), stride[axis] * element_size);
    }
  }
  return kept;
}

// Combines multi-index increment and corresponding pointer in the tensor to transpose.
// Treats `mindex[0..naxes)` as an odometer (last axis fastest) and advances it
// one step, moving `local_source` by the matching pre-scaled stride so callers
// never recompute a full index-times-stride dot product per element.
static void IncrementIndexAndComputeOffset(MultiIndex* mindex, size_t naxes, const uint8_t*& local_source) {
// Fast path: bump the innermost (last) axis; most increments stop here.
MultiIndex* it = mindex + (naxes - 1);
local_source += it->stride;
if (++it->index < it->upper_bound)
return;
// Innermost axis wrapped: undo the strides accumulated along it, reset it,
// and carry into the outer axes until one does not wrap.
local_source -= it->stride * it->index;
it->index = 0;
--it;
MultiIndex* rend = mindex - 1;
for (; it != rend; --it) {
local_source += it->stride;
if (++it->index < it->upper_bound)
break;
local_source -= it->stride * it->index;
it->index = 0;
}
// NOTE(review): the two lines below are residue from the scraped diff view
// (the tail of the deleted ComputeOffset helper), not part of this function.
return offset;
}

// IncrementIndex: Increment an index into a tensor (in lexicographic ordering), wrapping
// around the specified upper_bound.
static inline void IncrementIndex(std::vector<int64_t>& index, const std::vector<int64_t>& upper_bound, int64_t num_axes) {
for (int64_t k = num_axes - 1; k >= 0; --k) {
index[k]++;
if (index[k] < upper_bound[k]) break;
index[k] = 0;
// Combines multi-index increment and corresponding pointer in the string tensor to transpose.
// std::string overload of the odometer increment above. Setup is called with
// element_size == 1 for this path, so `stride` counts elements and the
// pointer arithmetic on const std::string* scales by sizeof(std::string).
static void IncrementIndexAndComputeOffset(MultiIndex* mindex, size_t naxes, const std::string*& local_source) {
// Fast path: bump the innermost (last) axis; most increments stop here.
MultiIndex* it = mindex + (naxes - 1);
local_source += it->stride;
if (++it->index < it->upper_bound)
return;
// Innermost axis wrapped: rewind it, reset, and carry into outer axes.
local_source -= it->stride * it->index;
it->index = 0;
--it;
MultiIndex* rend = mindex - 1;
for (; it != rend; --it) {
local_source += it->stride;
if (++it->index < it->upper_bound)
break;
local_source -= it->stride * it->index;
it->index = 0;
}
}

Expand All @@ -52,35 +95,28 @@ static void DoTransposeImpl(int64_t num_axes, const std::vector<int64_t>& target
size_t num_blocks, size_t num_elts_in_block, const std::vector<size_t>& stride,
const uint8_t* source, uint8_t* target, size_t element_size) {
size_t blocksize = num_elts_in_block * element_size;
// index used to iterate over target iteration-space
std::vector<int64_t> target_index(num_axes, 0);
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
memcpy(target, source + source_offset * element_size, blocksize);
std::vector<MultiIndex> mindex(num_axes);
size_t naxes = IncrementIndexAndComputeOffsetSetup(mindex.data(), num_axes, target_dims, stride, element_size);

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
const uint8_t* local_source = source;
for (size_t i = 0; i < num_blocks; ++i) {
memcpy(target, local_source, blocksize);
IncrementIndexAndComputeOffset(mindex.data(), naxes, local_source);
target += blocksize;
}
}

static void DoTransposeImpl(int64_t num_axes, const std::vector<int64_t>& target_dims,
size_t num_blocks, size_t num_elts_in_block, const std::vector<size_t>& stride,
const std::string* source, std::string* target) {
// index used to iterate over target iteration-space
std::vector<int64_t> target_index(num_axes, 0);
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);
ORT_ENFORCE(num_axes > 0, "Transpose not implemented for empty tensors.");
hariharans29 marked this conversation as resolved.
Show resolved Hide resolved
std::vector<MultiIndex> mindex(num_axes);
size_t naxes = IncrementIndexAndComputeOffsetSetup(mindex.data(), num_axes, target_dims, stride, 1);

// copy
DoTransposeSingleBlock(num_elts_in_block, source + source_offset, target);

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
const std::string* local_source = source;
for (size_t i = 0; i < num_blocks; ++i) {
DoTransposeSingleBlock(num_elts_in_block, local_source, target);
IncrementIndexAndComputeOffset(mindex.data(), naxes, local_source);
target += num_elts_in_block;
}
}
Expand All @@ -90,67 +126,39 @@ inline void CopyPrim(uint8_t* target, const uint8_t* source) {
*reinterpret_cast<T*>(target) = *reinterpret_cast<const T*>(source);
}

// Element-wise transpose specialized on the element type T: walks the target
// tensor linearly while IncrementIndexAndComputeOffset moves the source
// pointer according to the permuted, pre-scaled strides.
// The function does not check num_axes > 0 but this is expected.
template <class T>
static void TypedDoTransposeEltWise(int64_t num_axes, const std::vector<int64_t>& target_dims, size_t num_blocks,
                                    const std::vector<size_t>& stride, const uint8_t* source, uint8_t* target) {
  std::vector<MultiIndex> mindex(num_axes);
  size_t naxes = IncrementIndexAndComputeOffsetSetup(mindex.data(), num_axes, target_dims, stride, sizeof(T));

  const uint8_t* local_source = source;
  for (size_t i = 0; i < num_blocks; ++i) {
    // BUG FIX: copy a T, not a hard-coded uint64_t. The original
    // CopyPrim<uint64_t> moved 8 bytes regardless of T, over-reading the
    // source and corrupting the target for 1-, 2- and 4-byte element types
    // (the stride/target advance below is sizeof(T), so neighbors were
    // clobbered on every write).
    CopyPrim<T>(target, local_source);
    IncrementIndexAndComputeOffset(mindex.data(), naxes, local_source);
    target += sizeof(T);
  }
}

// DoTransposeEltWise: specialization of DoTranspose for the num_elts_in_block=1 case.
// copies source tensor to target, transposing elements.
// The stride vector indicates the transposition.
static void DoTransposeEltWise(int64_t num_axes, const std::vector<int64_t>& target_dims, size_t num_blocks,
const std::vector<size_t>& stride, const uint8_t* source, uint8_t* target,
size_t element_size) {
// index used to iterate over target iteration-space
std::vector<int64_t> target_index(num_axes, 0);

switch (element_size) {
case sizeof(uint64_t):
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
CopyPrim<uint64_t>(target, source + (source_offset * element_size));

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
target += element_size;
}
TypedDoTransposeEltWise<uint64_t>(num_axes, target_dims, num_blocks, stride, source, target);
break;
case sizeof(uint32_t):
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
CopyPrim<uint32_t>(target, source + (source_offset * element_size));

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
target += element_size;
}
TypedDoTransposeEltWise<uint32_t>(num_axes, target_dims, num_blocks, stride, source, target);
break;
case sizeof(uint16_t):
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
CopyPrim<uint16_t>(target, source + (source_offset * element_size));

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
target += element_size;
}
TypedDoTransposeEltWise<uint16_t>(num_axes, target_dims, num_blocks, stride, source, target);
break;
case sizeof(uint8_t):
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
*target = *(source + (source_offset * element_size));

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
target += element_size;
}
TypedDoTransposeEltWise<uint8_t>(num_axes, target_dims, num_blocks, stride, source, target);
break;
default:
assert(false);
Expand All @@ -159,17 +167,15 @@ static void DoTransposeEltWise(int64_t num_axes, const std::vector<int64_t>& tar

static void DoTransposeEltWise(int64_t num_axes, const std::vector<int64_t>& target_dims, size_t num_blocks,
const std::vector<size_t>& stride, const std::string* source, std::string* target) {
ORT_ENFORCE(num_axes > 0, "Transpose not implemented for empty tensors.");
std::vector<MultiIndex> mindex(num_axes);
size_t naxes = IncrementIndexAndComputeOffsetSetup(mindex.data(), num_axes, target_dims, stride, 1);

// index used to iterate over target iteration-space
std::vector<int64_t> target_index(num_axes, 0);
const std::string* local_source = source;
for (size_t i = 0; i < num_blocks; ++i) {
// convert target_index into an offset in source data
size_t source_offset = ComputeOffset(target_index, stride, num_axes);

// copy
*target = *(source + source_offset);

// increment target_index:
IncrementIndex(target_index, target_dims, num_axes);
*target = *local_source;
IncrementIndexAndComputeOffset(mindex.data(), naxes, local_source);
target++;
}
}
Expand Down Expand Up @@ -271,13 +277,15 @@ template <typename T>
static void SimpleTransposeSingleAxisOutwards(const T* input_data, T* output_data,
int64_t num_loops, int64_t num_writers,
int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
const T* end;
for (int64_t l = 0; l < num_loops; ++l) {
T* output_for_first_writer = output_data;

for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
T* output_for_current_writer = output_for_first_writer;

for (int64_t w = 0; w < num_writers; ++w) {
end = input_data + num_writers;
for (; input_data != end;) {
*output_for_current_writer = *input_data++;

// skip to output position for next writer
Expand Down Expand Up @@ -363,13 +371,15 @@ template <typename T>
static void SimpleTransposeSingleAxisInwards(const T* input_data, T* output_data,
int64_t num_loops, int64_t num_readers,
int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
T* end;
for (int64_t l = 0; l < num_loops; ++l) {
const T* input_for_first_reader = input_data;

for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
const T* input_for_current_reader = input_for_first_reader;

for (int64_t r = 0; r < num_readers; ++r) {
end = output_data + num_readers;
for (; output_data != end;) {
*output_data++ = *input_for_current_reader;
// skip to input position for next reader
input_for_current_reader += reads_per_reader_per_loop;
Expand Down Expand Up @@ -531,6 +541,20 @@ static bool IsMovingSingleAxis(const std::vector<size_t>& permutations, size_t&
return single_axis_moved;
}

// Returns true when applying `perm` to `input_dims` is equivalent to a
// reshape, i.e. every axis whose dimension is > 1 keeps its relative order
// (size-1 axes may move freely since they contribute no data movement).
// Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
bool IsReshape(const std::vector<size_t>& perm, const std::vector<int64_t>& input_dims) {
  size_t prev_axis = 0;
  for (size_t pos = 0; pos < perm.size(); ++pos) {
    const size_t axis = perm[pos];
    if (input_dims[axis] != 1) {
      if (axis < prev_axis)
        return false;  // a non-trivial axis moved ahead of a later one
      prev_axis = axis;
    }
  }
  return true;
}

//`input_shape_override` overrides the shape of `input` for compute purposes.
Status TransposeBase::DoTranspose(const std::vector<size_t>& permutations, const Tensor& input, Tensor& output,
const TensorShape* input_shape_override) {
Expand All @@ -543,6 +567,14 @@ Status TransposeBase::DoTranspose(const std::vector<size_t>& permutations, const
status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Mismatched data types between input and output Tensors. ",
input_type, " != ", output_type);
} else {
TensorShape shape = input_shape_override ? *input_shape_override : input.Shape();
if (IsReshape(permutations, shape.GetDims())) {
// As long as the dims with values > 1 stay in the same order, it's a reshape.
// Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
CopyCpuTensor(&input, &output);
hariharans29 marked this conversation as resolved.
Show resolved Hide resolved
return Status::OK();
}

size_t from = 0, to = 0;
bool moving_single_axis = IsMovingSingleAxis(permutations, from, to);

Expand Down Expand Up @@ -578,6 +610,13 @@ Status Transpose::Compute(OpKernelContext* ctx) const {
if (output_shape.Size() == 0)
return Status::OK();

if (IsReshape(*p_perm, input_dims)) {
// As long as the dims with values > 1 stay in the same order, it's a reshape.
// Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
CopyCpuTensor(&X, &Y);
return Status::OK();
}

size_t from = 0, to = 0;
bool moving_single_axis = IsMovingSingleAxis(*p_perm, from, to);

Expand Down
33 changes: 33 additions & 0 deletions onnxruntime/test/providers/cpu/tensor/transpose_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,39 @@ TEST(TransposeOpTest, ThreeDimSuffix) {
TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals, false); //TensorRT: illegal error
}

// Perm (1,3,2,4,0) on shape (1,4,2,1,3) only relocates size-1 axes, so the
// element order is untouched: the transpose must degenerate into a plain copy
// carrying the reshaped output shape (4,1,2,3,1) with identical data.
TEST(TransposeOpTest, TransposeReshape) {
  std::vector<int64_t> shape({1, 4, 2, 1, 3});
  std::vector<float> data = {
      1.0f, 2.0f, 3.0f,
      4.0f, 5.0f, 6.0f,

      1.1f, 2.1f, 3.1f,
      4.1f, 5.1f, 6.1f,

      1.2f, 2.2f, 3.2f,
      4.2f, 5.2f, 6.2f,

      1.3f, 2.3f, 3.3f,
      4.3f, 5.3f, 6.3f};

  std::vector<int64_t> perm = {1, 3, 2, 4, 0};
  std::vector<int64_t> transposed_shape({4, 1, 2, 3, 1});
  // Expected output is byte-for-byte the input: only trivial axes moved.
  auto transposed_vals = {
      1.0f, 2.0f, 3.0f,
      4.0f, 5.0f, 6.0f,

      1.1f, 2.1f, 3.1f,
      4.1f, 5.1f, 6.1f,

      1.2f, 2.2f, 3.2f,
      4.2f, 5.2f, 6.2f,

      1.3f, 2.3f, 3.3f,
      4.3f, 5.3f, 6.3f};

  TransposeTest(shape, data, &perm, transposed_shape, transposed_vals, false);  //TensorRT: illegal error
}

TEST(TransposeOpTest, ThreeDimStr) {
std::vector<int64_t> input_shape({4, 2, 3});
std::vector<std::string> input_vals = {
Expand Down