From 97834d0364f4b7a1a5dcfa5625761d00d932751b Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Thu, 15 Jul 2021 15:23:41 +0200
Subject: [PATCH] This is what the idea expressed in #3490 looks like: using
 function templates so the TFloat float & double implementations can co-exist
 at run time without cluttering the code with #if/#else, and with no run-time
 switches (yet).

## Observations thus far

- DRY? Check!
- The whole "function templates (and let the C++ compiler do the heavy
  lifting)" idea stops somewhere. Regrettably, that happens to be in the
  weightmatrix.cpp code, where the CPU+configuration-selected SIMD
  implementation is invoked via a function pointer:
  `intSimdMatrix->matrixDotVectorFunction`. Supporting both float widths
  there would require code duplication of some kind (e.g. an FP32 callback
  pointer co-existing with an FP64 callback pointer in the struct, with the
  code picking the right one depending on the current TFloat size), which I
  deem unsatisfactory (my opinion).
- So far, and very probably independent of any solution for the co-existence
  issue at higher levels in the code, this template approach works out well:
  the compiler smartly picks the overload matching the current float/double
  choice.
- While we now have double the number of specialized SIMD implementations
  (obviously), these do not need #if/#else checks, as we can let the C++
  compiler do its prototype-matching job --> cleaner code.
- The template functions also help clean up the serialization /
  de-serialization code: the dual-type approach there allows one to specify
  the run-time type (TFloat) and the file-storage type at the same time (a
  sketch of the pattern is appended after the diff). Also note how this
  cleans up the 'old' scales de-serialization code, as the old file storage
  type is simply 'float' instead of 'double'.
- The added cost there is a double copy of the file data when T == ST, but
  that turned out negligible in preliminary tests: that bit of code didn't
  even reach the Top-20 CPU Guzzlers Chart, so the extra copy can wait for
  smarter C++ template writers to take care of when microtuning is called
  for.
---
 src/arch/dotproduct.cpp        |   5 +
 src/arch/dotproduct.h          |  37 +++--
 src/arch/dotproductavx.cpp     |  31 +++--
 src/arch/dotproductfma.cpp     |  37 +++--
 src/arch/dotproductsse.cpp     |  28 ++--
 src/arch/intsimdmatrixavx2.cpp | 237 ++++-----------------------------
 src/arch/intsimdmatrixneon.cpp |  14 +-
 src/arch/intsimdmatrixsse.cpp  |  59 +++-----
 src/arch/simddetect.cpp        |  19 ++-
 9 files changed, 172 insertions(+), 295 deletions(-)

diff --git a/src/arch/dotproduct.cpp b/src/arch/dotproduct.cpp
index f964e3256e..daba2ff25b 100644
--- a/src/arch/dotproduct.cpp
+++ b/src/arch/dotproduct.cpp
@@ -19,6 +19,7 @@ namespace tesseract {
 
 // Computes and returns the dot product of the two n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   TFloat total = 0;
 #if defined(OPENMP_SIMD) || defined(_OPENMP)
@@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   return total;
 }
 
+// Two instantiations: float & double.
+template float DotProductNative(const float *u, const float *v, int n);
+template double DotProductNative(const double *u, const double *v, int n);
+
 } // namespace tesseract
diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h
index c9b2756e2c..756918d5e3 100644
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@@ -22,22 +22,43 @@ namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
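+// Implemented as a function template so the float and double builds share
+// one body; the two explicit instantiations live in dotproduct.cpp.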
+template <class TFloat>
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
+
+// ------------ FAST FLOAT specializations -----------------
+
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+float DotProductAVX(const float *u, const float *v, int n);
+float DotProductAVX1(const float *u, const float *v, int n);
+float DotProductAVX2(const float *u, const float *v, int n);
+float DotProductAVX3(const float *u, const float *v, int n);
+float DotProductAVX4(const float *u, const float *v, int n);
+
+// Use Intel FMA.
+float DotProductFMA(const float *u, const float *v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+float DotProductSSE(const float *u, const float *v, int n);
+
+float DotProductAccelerate(const float *u, const float *v, int n);
+
+// ------------ HIGH PRECISION DOUBLE specializations -----------------
+
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);
+double DotProductAVX(const double *u, const double *v, int n);
+double DotProductAVX1(const double *u, const double *v, int n);
+double DotProductAVX2(const double *u, const double *v, int n);
+double DotProductAVX3(const double *u, const double *v, int n);
+double DotProductAVX4(const double *u, const double *v, int n);
 
 // Use Intel FMA.
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
+double DotProductFMA(const double *u, const double *v, int n);
 
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
+double DotProductSSE(const double *u, const double *v, int n);
+
+double DotProductAccelerate(const double *u, const double *v, int n);
 
-TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
 } // namespace tesseract.
 
 #endif // TESSERACT_ARCH_DOTPRODUCT_H_
diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
index 4c49e9e4a3..3bdb250fc1 100644
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@@ -15,11 +15,9 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__AVX__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for AVX capable architectures
-# endif
-#else
+#include "intsimdmatrix.h"
+
+#if defined(__AVX__)
 
 # include <immintrin.h>
 # include <stdint.h>
 
@@ -27,9 +25,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
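+// Float variant: accumulates eight packed floats per 256-bit register (the
+// unrolled AVX1 variant below consumes sixteen floats per loop iteration).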
-#if defined(FAST_FLOAT)
 float DotProductAVX(const float *u, const float *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -50,6 +49,7 @@ float DotProductAVX(const float *u, const float *v, int n) {
   }
   return result;
 }
+
 float DotProductAVX1(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
@@ -76,7 +76,9 @@ float DotProductAVX1(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductAVX1(const double *u, const double *v, int n) {
   __m256d t0 = _mm256_setzero_pd();
   __m256d t1 = _mm256_setzero_pd();
@@ -130,8 +132,21 @@ double DotProductAVX(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END FLOAT/DOUBLE sections ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without AVX: delegates to the FMA implementation.
+inline TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n) {
+  return DotProductFMA(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index 6afaefd3eb..704ae0bd01 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -15,11 +15,7 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__FMA__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for FMA capable architectures
-# endif
-#else
+#if defined(__FMA__)
 
 # include <immintrin.h>
 # include <stdint.h>
 
@@ -27,10 +23,11 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
+float DotProductFMA(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
@@ -48,15 +45,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
     v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) TFloat tmp[8];
+  alignas(32) float tmp[8];
   _mm256_store_ps(tmp, t0);
-  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductFMA(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -83,8 +82,24 @@ double DotProductFMA(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without FMA: delegates to the SSE implementation.
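+// Part of the graceful-degradation chain visible across these files:
+// AVX -> FMA -> SSE -> generic DotProductNative.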
+inline float DotProductFMA(const float *u, const float *v, int n) {
+  return DotProductSSE(u, v, n);
+}
+inline double DotProductFMA(const double *u, const double *v, int n) {
+  return DotProductSSE(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index 9122e9d1b1..3b26e43e63 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -15,11 +15,7 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__SSE4_1__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for SSE 4.1 capable architectures
-# endif
-#else
+#if defined(__SSE4_1__)
 
 # include <emmintrin.h>
 # include <smmintrin.h>
 
@@ -28,9 +24,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductSSE(const float *u, const float *v, int n) {
   int max_offset = n - 4;
   int offset = 0;
@@ -89,7 +86,9 @@ float DotProductSSE(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductSSE(const double *u, const double *v, int n) {
   int max_offset = n - 2;
   int offset = 0;
@@ -139,8 +138,21 @@ double DotProductSSE(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without SSE 4.1: delegates to the generic
+// implementation.
+inline TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
+  return DotProductNative(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp
index c782ebb38c..bd5bbf2695 100644
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@@ -17,11 +17,7 @@
 
 #include "intsimdmatrix.h"
 
-#if !defined(__AVX2__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for AVX2 capable architectures
-# endif
-#else
+#if defined(__AVX2__)
 # include <immintrin.h>
 # include <stdint.h>
 # include <algorithm>
@@ -85,7 +81,7 @@ static inline __m128i load64_to_128(const int8_t *wi_) {
   return _mm_set_epi64x(0, wi[0]);
 }
 
-#if defined(FAST_FLOAT)
+// ------------- FAST FLOAT specifics section -------------------------
 
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const float *scales,
                                    float *v) {
@@ -130,198 +126,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1,
   v += 16;
 }
 
-// Computes part of matrix.vector v = Wu. Computes N=64 results.
-// The weights *must* be arranged so that consecutive reads from wi
-// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
-// (kNumInputsPerGroup inputs))). After that there must be N consecutive
-// bias weights, before continuing with any more weights.
-// u must be padded out with zeros to
-// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  __m256i result4 = _mm256_setzero_si256();
-  __m256i result5 = _mm256_setzero_si256();
-  __m256i result6 = _mm256_setzero_si256();
-  __m256i result7 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-  ExtractResults16(result4, result5, wi, scales, v);
-  ExtractResults16(result6, result7, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=32 results.
-// For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=16 results.
-// For details see PartialMatrixDotVector64 with N=16.
-static void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=8 results.
-// For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u,
-                                           int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-    }
-  }
-  ExtractResults8(result0, wi, scales, v);
-}
-
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-  // Each call to a partial_func_ produces group_size outputs, except the
-  // last one, which can produce less.
-  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
-  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
-  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
-  int output = 0;
-
-  int w_step = (rounded_num_in + 1) * group_size;
-
-  // Run with this group size, until it would produce too much output, then
-  // switch to a smaller size.
-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
+// ------------- HIGH-PRECISION DOUBLE specifics section -------------------------
 
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
-  }
-}
-#else
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
                                    double *v) {
   __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
@@ -375,6 +181,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
   v += 16;
 }
 
+// ------------- END specifics section -------------------------
+
 // Computes part of matrix.vector v = Wu. Computes N=64 results.
 // The weights *must* be arranged so that consecutive reads from wi
 // provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
@@ -382,8 +190,9 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
 // bias weights, before continuing with any more weights.
 // u must be padded out with zeros to
 // kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector64(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -427,8 +236,9 @@ static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=32 results.
 // For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector32(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -462,8 +272,9 @@ static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=16 results.
 // For details see PartialMatrixDotVector64 with N=16.
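+// Templated on TFloat like the functions above: one body serves both the
+// float and the double build, instantiated by the compiler as needed.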
-static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector16(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -492,8 +303,9 @@ static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=8 results.
 // For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
-                                           int num_in, double *v) {
+template <class TFloat>
+static inline void PartialMatrixDotVector8(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                           int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -518,8 +330,9 @@ static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scale
   ExtractResults8(result0, wi, scales, v);
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
   const int num_out = dim1;
   const int num_in = dim2 - 1;
   // Each call to a partial_func_ produces group_size outputs, except the
@@ -566,7 +379,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *
     PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
   }
 }
-#endif
+
 
 static const IntSimdMatrix simdMatrix = {
   // Function.
@@ -585,4 +398,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = nullptr;
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/intsimdmatrixneon.cpp b/src/arch/intsimdmatrixneon.cpp
index 260f747d48..9eecc6cde4 100644
--- a/src/arch/intsimdmatrixneon.cpp
+++ b/src/arch/intsimdmatrixneon.cpp
@@ -16,10 +16,10 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if defined(__ARM_NEON)
+#include "intsimdmatrix.h"
+#include "tfloat.h"
 
-# include "intsimdmatrix.h"
-# include "tfloat.h"
+#if defined(HAVE_NEON)
 
 # include <cstdint>
 # include <arm_neon.h>
 
@@ -212,4 +212,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixNEON = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixNEON = nullptr;
+
+} // namespace tesseract.
+
 #endif /* __ARM_NEON */
diff --git a/src/arch/intsimdmatrixsse.cpp b/src/arch/intsimdmatrixsse.cpp
index a46b319fd2..0e1d1f7b5b 100644
--- a/src/arch/intsimdmatrixsse.cpp
+++ b/src/arch/intsimdmatrixsse.cpp
@@ -17,44 +17,7 @@
 
 #include "intsimdmatrix.h"
 
-#if !defined(__SSE4_1__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for SSE 4.1 capable architectures
-# endif
-#elif defined(FAST_FLOAT)
-namespace tesseract {
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-//#pragma omp simd collapse(2)
-  for (int i = 0; i < num_out; ++i) {
-    int total = 0;
-#pragma omp simd reduction(+:total)
-    for (int j = 0; j < num_in; ++j) {
-      total += wi[j] * u[j];
-    }
-    // Add in the bias and correct for integer values.
-    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
-    wi += dim2;
-  }
-}
-
-static const IntSimdMatrix simdMatrix = {
-  matrixDotVector,
-  // Number of 32 bit outputs held in each register.
-  1,
-  // Maximum number of registers that we will use to hold outputs.
-  1,
-  // Number of 8 bit inputs in the inputs register.
-  1,
-  // Number of inputs in each weight group.
-  1
-};
-
-const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = &simdMatrix;
-}
-#else
+#if defined(__SSE4_1__)
 
 # include <emmintrin.h>
 # include <smmintrin.h>
 
@@ -102,15 +65,17 @@ static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
 }
 
 // Computes part of matrix.vector v = Wu. Computes 1 result.
-static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
-                                    int num_in, double *v) {
-  double total = IntDotProductSSE(u, wi, num_in);
+template <class TFloat>
+static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                    int num_in, TFloat *v) {
+  TFloat total = IntDotProductSSE(u, wi, num_in);
   // Add in the bias and correct for integer values.
   *v = (total + wi[num_in] * INT8_MAX) * *scales;
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
  const int num_out = dim1;
   const int num_in = dim2 - 1;
   int output = 0;
@@ -139,4 +104,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = nullptr;
+
+} // namespace tesseract.
+ #endif diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp index 6c7f822239..f58c9eb494 100644 --- a/src/arch/simddetect.cpp +++ b/src/arch/simddetect.cpp @@ -24,6 +24,7 @@ #include "params.h" // for STRING_VAR #include "simddetect.h" #include "tprintf.h" // for tprintf +#include "tfloat.h" #if defined(HAVE_FRAMEWORK_ACCELERATE) @@ -101,16 +102,21 @@ bool SIMDDetect::sse_available_; #endif #if defined(HAVE_FRAMEWORK_ACCELERATE) -TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) { - TFloat total = 0; + +float DotProductAccelerate(const float* u, const float* v, int n) { + float total = 0; const int stride = 1; -#if defined(FAST_FLOAT) vDSP_dotpr(u, stride, v, stride, &total, n); -#else + return total; +} + +double DotProductAccelerate(const double* u, const double* v, int n) { + double total = 0; + const int stride = 1; vDSP_dotprD(u, stride, v, stride, &total, n); -#endif return total; } + #endif // Computes and returns the dot product of the two n-vectors u and v. @@ -258,7 +264,7 @@ SIMDDetect::SIMDDetect() { #if defined(HAVE_NEON) || defined(__aarch64__) } else if (neon_available_) { // NEON detected. - SetDotProduct(DotProduct, IntSimdMatrix::intSimdMatrixNEON); + SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrixNEON); #endif } } @@ -288,6 +294,7 @@ void SIMDDetect::Update() { SetDotProduct(DotProductAVX, IntSimdMatrix::intSimdMatrixAVX2); dotproduct_method = "avx2"; } else if (dotproduct == "avx-1") { + // AVX2 (Alternative Implementation) selected by config variable. SetDotProduct(DotProductAVX1, IntSimdMatrix::intSimdMatrixAVX2); dotproduct_method = "avx-1"; #endif
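--
The dual-type (de)serialization pattern referenced in the observations above
is not part of this diff excerpt. The following is a minimal sketch of the
idea only; the helper name `DeSerializeAs` and its signature are hypothetical
illustrations, not the actual tesseract API.

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

namespace tesseract {

// Hypothetical helper: read `count` values stored on disk as type ST and
// convert them element-wise to the run-time type T. For the 'old' scales
// format, ST is float regardless of whether TFloat is float or double.
template <typename T, typename ST>
bool DeSerializeAs(FILE *fp, std::vector<T> &out, size_t count) {
  std::vector<ST> tmp(count); // file-storage representation
  if (fread(tmp.data(), sizeof(ST), count, fp) != count) {
    return false; // short read: truncated or corrupt model file
  }
  // Element-wise ST -> T conversion; when T == ST this is the extra copy
  // mentioned above, which preliminary profiling showed to be negligible.
  out.assign(tmp.begin(), tmp.end());
  return true;
}

} // namespace tesseract
```

A double build would then read old models via `DeSerializeAs<double, float>`
and new ones via `DeSerializeAs<double, double>`, the compiler emitting both
variants from the single template body.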