diff --git a/src/arch/dotproduct.cpp b/src/arch/dotproduct.cpp index 47c042f5df..9a0e5cb6f9 100644 --- a/src/arch/dotproduct.cpp +++ b/src/arch/dotproduct.cpp @@ -19,6 +19,7 @@ namespace tesseract { // Computes and returns the dot product of the two n-vectors u and v. +template TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) { TFloat total = 0; #if defined(OPENMP_SIMD) @@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) { return total; } +// two instantiations: float & double. +template float DotProductNative(const float *u, const float *v, int n); +template double DotProductNative(const double *u, const double *v, int n); + } // namespace tesseract diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h index c9b2756e2c..756918d5e3 100644 --- a/src/arch/dotproduct.h +++ b/src/arch/dotproduct.h @@ -22,22 +22,43 @@ namespace tesseract { // Computes and returns the dot product of the n-vectors u and v. +template TFloat DotProductNative(const TFloat *u, const TFloat *v, int n); +// ------------ FAST FLOAT specializations ----------------- + +// Uses Intel AVX intrinsics to access the SIMD instruction set. +float DotProductAVX(const float *u, const float *v, int n); +float DotProductAVX1(const float *u, const float *v, int n); +float DotProductAVX2(const float *u, const float *v, int n); +float DotProductAVX3(const float *u, const float *v, int n); +float DotProductAVX4(const float *u, const float *v, int n); + +// Use Intel FMA. +float DotProductFMA(const float *u, const float *v, int n); + +// Uses Intel SSE intrinsics to access the SIMD instruction set. +float DotProductSSE(const float *u, const float *v, int n); + +float DotProductAccelerate(const float *u, const float *v, int n); + +// ------------ HIGH PRECISION DOUBLE specializations ----------------- + // Uses Intel AVX intrinsics to access the SIMD instruction set. 
-TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n); -TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n); -TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n); -TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n); -TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n); +double DotProductAVX(const double *u, const double *v, int n); +double DotProductAVX1(const double *u, const double *v, int n); +double DotProductAVX2(const double *u, const double *v, int n); +double DotProductAVX3(const double *u, const double *v, int n); +double DotProductAVX4(const double *u, const double *v, int n); // Use Intel FMA. -TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n); +double DotProductFMA(const double *u, const double *v, int n); // Uses Intel SSE intrinsics to access the SIMD instruction set. -TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n); +double DotProductSSE(const double *u, const double *v, int n); + +double DotProductAccelerate(const double *u, const double *v, int n); -TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n); } // namespace tesseract. #endif // TESSERACT_ARCH_DOTPRODUCT_H_ diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp index 4c49e9e4a3..275ae28363 100644 --- a/src/arch/dotproductavx.cpp +++ b/src/arch/dotproductavx.cpp @@ -27,9 +27,10 @@ namespace tesseract { +// ---------------------------- FAST FLOAT section ------------------------ + // Computes and returns the dot product of the n-vectors u and v. // Uses Intel AVX intrinsics to access the SIMD instruction set. 
-#if defined(FAST_FLOAT) float DotProductAVX(const float *u, const float *v, int n) { const unsigned quot = n / 8; const unsigned rem = n % 8; @@ -50,6 +51,7 @@ float DotProductAVX(const float *u, const float *v, int n) { } return result; } + float DotProductAVX1(const float *u, const float *v, int n) { const unsigned quot = n / 16; const unsigned rem = n % 16; @@ -76,7 +78,9 @@ float DotProductAVX1(const float *u, const float *v, int n) { } return result; } -#else + +// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------ + double DotProductAVX1(const double *u, const double *v, int n) { __m256d t0 = _mm256_setzero_pd(); __m256d t1 = _mm256_setzero_pd(); @@ -130,7 +134,8 @@ double DotProductAVX(const double *u, const double *v, int n) { } return result; } -#endif + +// ---------------------------- END FLOAT/DOUBLE sections ------------------------ } // namespace tesseract. diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp index 6afaefd3eb..edc85d0b33 100644 --- a/src/arch/dotproductfma.cpp +++ b/src/arch/dotproductfma.cpp @@ -27,10 +27,11 @@ namespace tesseract { +// ---------------------------- FAST FLOAT section ------------------------ + // Computes and returns the dot product of the n-vectors u and v. // Uses Intel FMA intrinsics to access the SIMD instruction set. 
-#if defined(FAST_FLOAT) -TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) { +float DotProductFMA(const float *u, const float *v, int n) { const unsigned quot = n / 16; const unsigned rem = n % 16; __m256 t0 = _mm256_setzero_ps(); @@ -48,15 +49,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) { v += 8; } t0 = _mm256_hadd_ps(t0, t1); - alignas(32) TFloat tmp[8]; + alignas(32) float tmp[8]; _mm256_store_ps(tmp, t0); - TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7]; + float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7]; for (unsigned k = 0; k < rem; k++) { result += *u++ * *v++; } return result; } -#else + +// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------ + double DotProductFMA(const double *u, const double *v, int n) { const unsigned quot = n / 8; const unsigned rem = n % 8; @@ -83,7 +86,8 @@ double DotProductFMA(const double *u, const double *v, int n) { } return result; } -#endif + +// ---------------------------- END section ------------------------ } // namespace tesseract. diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp index 9122e9d1b1..39ddf01972 100644 --- a/src/arch/dotproductsse.cpp +++ b/src/arch/dotproductsse.cpp @@ -28,9 +28,10 @@ namespace tesseract { +// ---------------------------- FAST FLOAT section ------------------------ + // Computes and returns the dot product of the n-vectors u and v. // Uses Intel SSE intrinsics to access the SIMD instruction set. 
-#if defined(FAST_FLOAT) float DotProductSSE(const float *u, const float *v, int n) { int max_offset = n - 4; int offset = 0; @@ -89,7 +90,9 @@ float DotProductSSE(const float *u, const float *v, int n) { } return result; } -#else + +// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------ + double DotProductSSE(const double *u, const double *v, int n) { int max_offset = n - 2; int offset = 0; @@ -139,7 +142,8 @@ double DotProductSSE(const double *u, const double *v, int n) { } return result; } -#endif + +// ---------------------------- END section ------------------------ } // namespace tesseract. diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp index c87ef414a7..92936309b3 100644 --- a/src/arch/intsimdmatrixavx2.cpp +++ b/src/arch/intsimdmatrixavx2.cpp @@ -21,53 +21,8 @@ # if defined(__i686__) || defined(__x86_64__) # error Implementation only for AVX2 capable architectures # endif -#elif defined(FAST_FLOAT) -namespace tesseract { - -static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales, - const int8_t *u, float *v) { - const int num_out = dim1; - const int num_in = dim2 - 1; - for (int i = 0; i < num_out; ++i) { - for (int j = 0; j < num_in; ++j) { - } - } -} - -#if 0 -void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY &w, - const std::vector &scales, const int8_t *u, TFloat *v) { - int num_out = w.dim1(); - int num_in = w.dim2() - 1; - // Base implementation. - for (int i = 0; i < num_out; ++i) { - const int8_t *wi = w[i]; - int total = 0; - for (int j = 0; j < num_in; ++j) { - total += wi[j] * u[j]; - } - // Add in the bias and correct for integer values. - v[i] = (total + wi[num_in] * INT8_MAX) * scales[i]; - } -} -#endif - -static const IntSimdMatrix simdMatrix = { - // Function. - matrixDotVector, - // Number of 32 bit outputs held in each register. - 1, - // Maximum number of registers that we will use to hold outputs. 
- 1, - // Number of 8 bit inputs in the inputs register. - 1, - // Number of inputs in each weight group. - 1 -}; - -const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = &simdMatrix; -} #else + # include # include # include @@ -131,7 +86,7 @@ static inline __m128i load64_to_128(const int8_t *wi_) { return _mm_set_epi64x(0, wi[0]); } -#if defined(FAST_FLOAT) +// ------------- FAST FLOAT specifics section ------------------------- static inline void ExtractResults8(__m256i result, const int8_t *wi, const float *scales, float *v) { @@ -176,198 +131,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, v += 16; } -// Computes part of matrix.vector v = Wu. Computes N=64 results. -// The weights *must* be arranged so that consecutive reads from wi -// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of -// (kNumInputsPerGroup inputs))). After that there must be N consecutive -// bias weights, before continuing with any more weights. -// u must be padded out with zeros to -// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements. -static void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u, - int num_in, float *v) { - // Register containing 16-bit ones for horizontal add with 16->32 bit - // conversion. - __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); - // Initialize all the results to 0. - __m256i result0 = _mm256_setzero_si256(); - __m256i result1 = _mm256_setzero_si256(); - __m256i result2 = _mm256_setzero_si256(); - __m256i result3 = _mm256_setzero_si256(); - __m256i result4 = _mm256_setzero_si256(); - __m256i result5 = _mm256_setzero_si256(); - __m256i result6 = _mm256_setzero_si256(); - __m256i result7 = _mm256_setzero_si256(); - // Iterate over the input (u), one registerful at a time. 
- for (int j = 0; j < num_in;) { - __m256i inputs = _mm256_loadu_si256(reinterpret_cast(u + j)); - // Inputs are processed in groups of kNumInputsPerGroup, replicated - // kNumInputGroups times. - for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { - // Replicate the low 32 bits (4 inputs) 8 times. - __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); - // Rotate the inputs in groups of 4, so the next 4 inputs are ready. - inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); - __m256i weights, reps; - // Mul-add, with horizontal add of the 4 inputs to each of the results. - MultiplyGroup(rep_input, ones, wi, weights, reps, result0); - MultiplyGroup(rep_input, ones, wi, weights, reps, result1); - MultiplyGroup(rep_input, ones, wi, weights, reps, result2); - MultiplyGroup(rep_input, ones, wi, weights, reps, result3); - MultiplyGroup(rep_input, ones, wi, weights, reps, result4); - MultiplyGroup(rep_input, ones, wi, weights, reps, result5); - MultiplyGroup(rep_input, ones, wi, weights, reps, result6); - MultiplyGroup(rep_input, ones, wi, weights, reps, result7); - } - } - ExtractResults16(result0, result1, wi, scales, v); - ExtractResults16(result2, result3, wi, scales, v); - ExtractResults16(result4, result5, wi, scales, v); - ExtractResults16(result6, result7, wi, scales, v); -} - -// Computes part of matrix.vector v = Wu. Computes N=32 results. -// For details see PartialMatrixDotVector64 with N=32. -static void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u, - int num_in, float *v) { - // Register containing 16-bit ones for horizontal add with 16->32 bit - // conversion. - __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); - // Initialize all the results to 0. 
- __m256i result0 = _mm256_setzero_si256(); - __m256i result1 = _mm256_setzero_si256(); - __m256i result2 = _mm256_setzero_si256(); - __m256i result3 = _mm256_setzero_si256(); - // Iterate over the input (u), one registerful at a time. - for (int j = 0; j < num_in;) { - __m256i inputs = _mm256_loadu_si256(reinterpret_cast(u + j)); - // Inputs are processed in groups of kNumInputsPerGroup, replicated - // kNumInputGroups times. - for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { - // Replicate the low 32 bits (4 inputs) 8 times. - __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); - // Rotate the inputs in groups of 4, so the next 4 inputs are ready. - inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); - __m256i weights, reps; - // Mul-add, with horizontal add of the 4 inputs to each of the results. - MultiplyGroup(rep_input, ones, wi, weights, reps, result0); - MultiplyGroup(rep_input, ones, wi, weights, reps, result1); - MultiplyGroup(rep_input, ones, wi, weights, reps, result2); - MultiplyGroup(rep_input, ones, wi, weights, reps, result3); - } - } - ExtractResults16(result0, result1, wi, scales, v); - ExtractResults16(result2, result3, wi, scales, v); -} +// ------------- HIGH-PRECISION DOUBLE specifics section ------------------------- -// Computes part of matrix.vector v = Wu. Computes N=16 results. -// For details see PartialMatrixDotVector64 with N=16. -static void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u, - int num_in, float *v) { - // Register containing 16-bit ones for horizontal add with 16->32 bit - // conversion. - __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); - // Initialize all the results to 0. - __m256i result0 = _mm256_setzero_si256(); - __m256i result1 = _mm256_setzero_si256(); - // Iterate over the input (u), one registerful at a time. 
- for (int j = 0; j < num_in;) { - __m256i inputs = _mm256_loadu_si256(reinterpret_cast(u + j)); - // Inputs are processed in groups of kNumInputsPerGroup, replicated - // kNumInputGroups times. - for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { - // Replicate the low 32 bits (4 inputs) 8 times. - __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); - // Rotate the inputs in groups of 4, so the next 4 inputs are ready. - inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); - __m256i weights, reps; - // Mul-add, with horizontal add of the 4 inputs to each of the results. - MultiplyGroup(rep_input, ones, wi, weights, reps, result0); - MultiplyGroup(rep_input, ones, wi, weights, reps, result1); - } - } - ExtractResults16(result0, result1, wi, scales, v); -} - -// Computes part of matrix.vector v = Wu. Computes N=8 results. -// For details see PartialMatrixDotVector64 with N=8. -static inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u, - int num_in, float *v) { - // Register containing 16-bit ones for horizontal add with 16->32 bit - // conversion. - __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); - // Initialize all the results to 0. - __m256i result0 = _mm256_setzero_si256(); - // Iterate over the input (u), one registerful at a time. - for (int j = 0; j < num_in;) { - __m256i inputs = _mm256_loadu_si256(reinterpret_cast(u + j)); - // Inputs are processed in groups of kNumInputsPerGroup, replicated - // kNumInputGroups times. - for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { - // Replicate the low 32 bits (4 inputs) 8 times. - __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); - // Rotate the inputs in groups of 4, so the next 4 inputs are ready. 
- inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); - __m256i weights, reps; - // Mul-add, with horizontal add of the 4 inputs to each of the results. - MultiplyGroup(rep_input, ones, wi, weights, reps, result0); - } - } - ExtractResults8(result0, wi, scales, v); -} - -static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales, - const int8_t *u, float *v) { - const int num_out = dim1; - const int num_in = dim2 - 1; - // Each call to a partial_func_ produces group_size outputs, except the - // last one, which can produce less. - const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup); - const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister); - int group_size = kNumOutputsPerRegister * kMaxOutputRegisters; - int output = 0; - - int w_step = (rounded_num_in + 1) * group_size; - - // Run with this group size, until it would produce too much output, then - // switch to a smaller size. - for (; output + group_size <= rounded_num_out; output += group_size) { - PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v); - wi += w_step; - scales += group_size; - v += group_size; - } - group_size /= 2; - w_step /= 2; - - if (output + group_size <= rounded_num_out) { - PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v); - wi += w_step; - scales += group_size; - v += group_size; - output += group_size; - } - group_size /= 2; - w_step /= 2; - - if (output + group_size <= rounded_num_out) { - PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v); - wi += w_step; - scales += group_size; - v += group_size; - output += group_size; - } - group_size /= 2; - w_step /= 2; - - if (output + group_size <= rounded_num_out) { - PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v); - } -} -#else static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales, double *v) { __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg @@ -421,6 +186,8 @@ 
static inline void ExtractResults16(__m256i result0, __m256i result1, const int8 v += 16; } +// ------------- END specifics section ------------------------- + // Computes part of matrix.vector v = Wu. Computes N=64 results. // The weights *must* be arranged so that consecutive reads from wi // provides (num_in/kNumInputsPerGroup groups of (N output dim groups of @@ -428,8 +195,9 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8 // bias weights, before continuing with any more weights. // u must be padded out with zeros to // kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements. -static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u, - int num_in, double *v) { +template +static void PartialMatrixDotVector64(const int8_t *wi, const TFloat *scales, const int8_t *u, + int num_in, TFloat *v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); @@ -473,8 +241,9 @@ static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, con // Computes part of matrix.vector v = Wu. Computes N=32 results. // For details see PartialMatrixDotVector64 with N=32. -static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u, - int num_in, double *v) { +template +static void PartialMatrixDotVector32(const int8_t *wi, const TFloat *scales, const int8_t *u, + int num_in, TFloat *v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); @@ -508,8 +277,9 @@ static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, con // Computes part of matrix.vector v = Wu. Computes N=16 results. // For details see PartialMatrixDotVector64 with N=16. 
-static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u, - int num_in, double *v) { +template +static void PartialMatrixDotVector16(const int8_t *wi, const TFloat *scales, const int8_t *u, + int num_in, TFloat *v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); @@ -538,8 +308,9 @@ static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, con // Computes part of matrix.vector v = Wu. Computes N=8 results. // For details see PartialMatrixDotVector64 with N=8. -static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u, - int num_in, double *v) { +template +static inline void PartialMatrixDotVector8(const int8_t *wi, const TFloat *scales, const int8_t *u, + int num_in, TFloat *v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); @@ -564,8 +335,9 @@ static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scale ExtractResults8(result0, wi, scales, v); } -static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales, - const int8_t *u, double *v) { +template +static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales, + const int8_t *u, TFloat *v) { const int num_out = dim1; const int num_in = dim2 - 1; // Each call to a partial_func_ produces group_size outputs, except the @@ -612,7 +384,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double * PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v); } } -#endif + static const IntSimdMatrix simdMatrix = { // Function. 
diff --git a/src/arch/intsimdmatrixsse.cpp b/src/arch/intsimdmatrixsse.cpp index 7407f6f5a1..ab71a1f1d7 100644 --- a/src/arch/intsimdmatrixsse.cpp +++ b/src/arch/intsimdmatrixsse.cpp @@ -21,10 +21,6 @@ # if defined(__i686__) || defined(__x86_64__) # error Implementation only for SSE 4.1 capable architectures # endif -#elif defined(FAST_FLOAT) -namespace tesseract { -const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = nullptr; -} #else # include @@ -73,15 +69,17 @@ static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) { } // Computes part of matrix.vector v = Wu. Computes 1 result. -static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u, - int num_in, double *v) { - double total = IntDotProductSSE(u, wi, num_in); +template +static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u, + int num_in, TFloat *v) { + TFloat total = IntDotProductSSE(u, wi, num_in); // Add in the bias and correct for integer values. *v = (total + wi[num_in] * INT8_MAX) * *scales; } -static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales, - const int8_t *u, double *v) { +template +static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales, + const int8_t *u, TFloat *v) { const int num_out = dim1; const int num_in = dim2 - 1; int output = 0; diff --git a/src/ccstruct/matrix.h b/src/ccstruct/matrix.h index a97912ad78..e1062294dc 100644 --- a/src/ccstruct/matrix.h +++ b/src/ccstruct/matrix.h @@ -145,45 +145,53 @@ class GENERIC_2D_ARRAY { } } + // ----------------------------------------------------------- + // Serialization & Deserialization to disk uses specific Storage Types (ST) + // which MAY not be identical to the run-time Type (T). + // ----------------------------------------------------------- + // Writes to the given file. Returns false in case of error. // Only works with bitwise-serializeable types! 
+ template bool Serialize(FILE *fp) const { if (!SerializeSize(fp)) { return false; } - if (!tesseract::Serialize(fp, &empty_)) { + if (!tesseract::Serialize(fp, &empty_)) { return false; } int size = num_elements(); - return tesseract::Serialize(fp, &array_[0], size); + return tesseract::Serialize(fp, &array_[0], size); } + template bool Serialize(TFile *fp) const { if (!SerializeSize(fp)) { return false; } - if (!fp->Serialize(&empty_)) { + if (!fp->Serialize(&empty_)) { return false; } int size = num_elements(); - return fp->Serialize(&array_[0], size); + return fp->Serialize(&array_[0], size); } // Reads from the given file. Returns false in case of error. // Only works with bitwise-serializeable types! // If swap is true, assumes a big/little-endian swap is needed. + template bool DeSerialize(bool swap, FILE *fp) { if (!DeSerializeSize(swap, fp)) { return false; } - if (!tesseract::DeSerialize(fp, &empty_)) { + if (!tesseract::DeSerialize(fp, &empty_)) { return false; } if (swap) { ReverseN(&empty_, sizeof(empty_)); } int size = num_elements(); - if (!tesseract::DeSerialize(fp, &array_[0], size)) { + if (!tesseract::DeSerialize(fp, &array_[0], size)) { return false; } if (swap) { @@ -194,9 +202,10 @@ class GENERIC_2D_ARRAY { return true; } + template bool DeSerialize(TFile *fp) { - return DeSerializeSize(fp) && fp->DeSerialize(&empty_) && - fp->DeSerialize(&array_[0], num_elements()); + return DeSerializeSize(fp) && fp->DeSerialize(&empty_) && + fp->DeSerialize(&array_[0], num_elements()); } // Writes to the given file. Returns false in case of error. @@ -291,7 +300,7 @@ class GENERIC_2D_ARRAY { void operator+=(const GENERIC_2D_ARRAY &addend) { if (dim2_ == addend.dim2_) { // Faster if equal size in the major dimension. 
- int size = std::min(num_elements(), addend.num_elements()); + int size = std::min(num_elements(), addend.num_elements()); for (int i = 0; i < size; ++i) { array_[i] += addend.array_[i]; } @@ -307,7 +316,7 @@ class GENERIC_2D_ARRAY { void operator-=(const GENERIC_2D_ARRAY &minuend) { if (dim2_ == minuend.dim2_) { // Faster if equal size in the major dimension. - int size = std::min(num_elements(), minuend.num_elements()); + int size = std::min(num_elements(), minuend.num_elements()); for (int i = 0; i < size; ++i) { array_[i] -= minuend.array_[i]; } @@ -467,8 +476,8 @@ class GENERIC_2D_ARRAY { // Higher dimensions above 2 are strictly the responsibility of the caller. void RotatingTranspose(const int *dims, int num_dims, int src_dim, int dest_dim, GENERIC_2D_ARRAY *result) const { - int max_d = std::max(src_dim, dest_dim); - int min_d = std::min(src_dim, dest_dim); + int max_d = std::max(src_dim, dest_dim); + int min_d = std::min(src_dim, dest_dim); // In a tensor of shape [d0, d1... min_d, ... max_d, ... dn-2, dn-1], the // ends outside of min_d and max_d are unaffected, with [max_d +1, dn-1] // being contiguous blocks of data that will move together, and @@ -632,7 +641,7 @@ class BandTriMatrix : public GENERIC_2D_ARRAY { // to *this. void AttachOnCorner(BandTriMatrix *array2) { int new_dim1 = this->dim1_ + array2->dim1_; - int new_dim2 = std::max(this->dim2_, array2->dim2_); + int new_dim2 = std::max(this->dim2_, array2->dim2_); T *new_array = new T[new_dim1 * new_dim2]; for (int col = 0; col < new_dim1; ++col) { for (int j = 0; j < new_dim2; ++j) { diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h index a07ed394eb..f656d5b8ca 100644 --- a/src/ccutil/serialis.h +++ b/src/ccutil/serialis.h @@ -26,6 +26,7 @@ #include #include #include // std::vector +#include "tfloat.h" namespace tesseract { @@ -43,6 +44,11 @@ constexpr size_t countof(T const (&)[N]) noexcept { return N; } +// Function to read a std::vector from a whole file. +// Returns false on failure. 
+// using FileReader = bool (*)(const char* filename, std::vector* data); +// ^-- imported from baseapi.h + // Function to write a std::vector to a whole file. // Returns false on failure. using FileWriter = bool (*)(const std::vector &data, const char *filename); @@ -64,6 +70,53 @@ bool Serialize(FILE *fp, const T *data, size_t n = 1) { return fwrite(data, sizeof(T), n, fp) == n; } +// Deserialize data from file. +template +bool DeSerialize(FILE* fp, T* data, size_t n = 1) { + ST* arr = new ST[n]; + bool rv = (fread(&arr[0], sizeof(ST), n, fp) == n); + if (rv) { + for (size_t i = 0; i < n; i++) { + data[i] = arr[i]; + } + } + delete[] arr; + return rv; +} + +template +bool DeSerialize(FILE* fp, double* data, size_t n); +template +bool DeSerialize(FILE* fp, double* data, size_t n); +template +bool DeSerialize(FILE* fp, float* data, size_t n); +template +bool DeSerialize(FILE* fp, float* data, size_t n); + +// Serialize data to file. +template +bool Serialize(FILE* fp, const T* data, size_t n = 1) { + ST* arr = new ST[n]; + for (size_t i = 0; i < n; i++) { + arr[i] = data[i]; + } + bool rv = (fwrite(&arr[0], sizeof(ST), n, fp) == n); + delete[] arr; + return rv; +} + +template +bool Serialize(FILE* fp, const double* data, size_t n); +template +bool Serialize(FILE* fp, const double* data, size_t n); +template +bool Serialize(FILE* fp, const float* data, size_t n); +template +bool Serialize(FILE* fp, const float* data, size_t n); + + + + // Simple file class. // Allows for portable file input from memory and from foreign file systems. 
class TESS_API TFile { @@ -128,9 +181,9 @@ class TESS_API TFile { data.resize(size); for (uint32_t i = 0; i < size; i++) { uint8_t non_null; - if (!DeSerialize(&non_null)) { + if (!DeSerialize(&non_null)) { return false; - } + } if (non_null) { typedef typename std::remove_pointer::type ST; auto item = new ST; @@ -149,13 +202,72 @@ class TESS_API TFile { } return true; } + template + bool DeSerialize(T* data, size_t count = 1) + { + ST* arr = new ST[count]; + bool rv = (FReadEndian(&arr[0], sizeof(ST), count) == static_cast(count)); + if (rv) + { + for (size_t i = 0; i < count; i++) + { + data[i] = arr[i]; + } + } + delete[] arr; + return rv; + } + template + bool DeSerialize(std::vector& data) + { + std::vector arr; + bool rv = DeSerialize(arr); + if (rv) + { + size_t len = arr.size(); + data.resize(len); + for (size_t i = 0; i < len; i++) { + data[i] = arr[i]; + } + } + return rv; + } +#if 0 + template bool DeSerialize(double* data, size_t count); + template bool DeSerialize(double* data, size_t count); + template bool DeSerialize(float* data, size_t count); + template bool DeSerialize(float* data, size_t count); +#endif // Serialize data. 
bool Serialize(const std::string &data); bool Serialize(const std::vector &data); template + bool Serialize(const T* data, size_t count = 1) { + return FWrite(data, sizeof(T), count) == static_cast(count); + } + template bool Serialize(const T *data, size_t count = 1) { - return FWrite(data, sizeof(T), count) == static_cast(count); + ST* arr = new ST[count]; + for (size_t i = 0; i < count; i++) + { + arr[i] = data[i]; + } + bool rv = (FWrite(&arr[0], sizeof(ST), count) == static_cast(count)); + delete[] arr; + return rv; + } + template + bool Serialize(const std::vector& data) + { + std::vector arr; + size_t len = data.size(); + arr.resize(len); + for (size_t i = 0; i < len; i++) { + arr[i] = data[i]; + } + bool rv = Serialize(arr); + return rv; } template bool Serialize(const std::vector &data) { @@ -181,14 +293,14 @@ class TESS_API TFile { // Serialize pointers. for (auto &item : data) { uint8_t non_null = (item != nullptr); - if (!Serialize(&non_null)) { + if (!Serialize(&non_null)) { return false; - } + } if (non_null) { if (!item->Serialize(this)) { return false; - } - } + } + } } } else if (size > 0) { // Serialize a non-class. diff --git a/src/lstm/weightmatrix.cpp b/src/lstm/weightmatrix.cpp index ba7b8653e7..46e10433ca 100644 --- a/src/lstm/weightmatrix.cpp +++ b/src/lstm/weightmatrix.cpp @@ -22,6 +22,7 @@ #include "simddetect.h" // for DotProduct #include "statistc.h" #include "tprintf.h" // forTFloat +#include "tfloat.h" namespace tesseract { @@ -37,7 +38,6 @@ const int kAdamCorrectionIterations = 200000; const TFloat kAdamEpsilon = 1e-8; // Utility functions convert between double and float arrays. 
-#ifdef FAST_FLOAT static void DoubleToFloat(const GENERIC_2D_ARRAY &src, GENERIC_2D_ARRAY &dst) { const auto dim1 = src.dim1(); const auto dim2 = src.dim2(); @@ -50,7 +50,6 @@ static void DoubleToFloat(const GENERIC_2D_ARRAY &src, GENERIC_2D_ARRAY< } } } -#endif static void FloatToDouble(const GENERIC_2D_ARRAY &src, GENERIC_2D_ARRAY &dst) { const auto dim1 = src.dim1(); @@ -66,28 +65,14 @@ static void FloatToDouble(const GENERIC_2D_ARRAY &src, GENERIC_2D_ARRAY &tfloat_array) { -#ifdef FAST_FLOAT - GENERIC_2D_ARRAY double_array; - if (!double_array.DeSerialize(fp)) { - return false; - } - DoubleToFloat(double_array, tfloat_array); - return true; -#else - return tfloat_array.DeSerialize(fp); -#endif + return tfloat_array.DeSerialize(fp); } static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY &tfloat_array) { -#ifdef FAST_FLOAT - GENERIC_2D_ARRAY double_array; - FloatToDouble(tfloat_array, double_array); - return double_array.Serialize(fp); -#else - return tfloat_array.Serialize(fp); -#endif + return tfloat_array.Serialize(fp); } + // Computes matrix.vector v = Wu. // u is of size W.dim2() - add_bias_fwd and the output v is of size // W.dim1() - skip_bias_back. 
@@ -243,7 +228,7 @@ bool WeightMatrix::Serialize(bool training, TFile *fp) const { return false; } if (int_mode_) { - if (!wi_.Serialize(fp)) { + if (!wi_.Serialize(fp)) { return false; } // The scales stored in memory have an extra factor applied to them @@ -257,14 +242,9 @@ bool WeightMatrix::Serialize(bool training, TFile *fp) const { if (!fp->Serialize(&size)) { return false; } -#ifdef FAST_FLOAT - assert(!"not implemented"); - return false; -#else - if (!fp->Serialize(&scales[0], size)) { + if (!fp->Serialize(&scales[0], size)) { return false; } -#endif } else { if (!tesseract::Serialize(fp, wf_)) { return false; @@ -294,31 +274,20 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) { return DeSerializeOld(training, fp); } if (int_mode_) { - if (!wi_.DeSerialize(fp)) { + if (!wi_.DeSerialize(fp)) { return false; } uint32_t size; if (!fp->DeSerialize(&size)) { return false; } -#ifdef FAST_FLOAT - scales_.reserve(size); - for (auto n = size; n > 0; n--) { - double val; - if (!fp->DeSerialize(&val)) { - return false; - } - scales_.push_back(val / INT8_MAX); - } -#else scales_.resize(size); - if (!fp->DeSerialize(&scales_[0], size)) { + if (!fp->DeSerialize(&scales_[0], size)) { return false; } for (auto &scale : scales_) { scale /= INT8_MAX; } -#endif if (IntSimdMatrix::intSimdMatrix) { int32_t rounded_num_out; IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out); @@ -346,44 +315,30 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) { // As DeSerialize, but reads an old (float) format WeightMatrix for // backward compatibility. bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) { -#ifdef FAST_FLOAT - // Not implemented. 
- assert(!"not implemented"); - return false; -#else if (int_mode_) { - if (!wi_.DeSerialize(fp)) { + if (!wi_.DeSerialize(fp)) { return false; } - std::vector old_scales; - if (!fp->DeSerialize(old_scales)) { + if (!fp->DeSerialize(scales_)) { return false; } - scales_.reserve(old_scales.size()); - for (float old_scale : old_scales) { - scales_.push_back(old_scale); - } } else { - GENERIC_2D_ARRAY float_array; - if (!float_array.DeSerialize(fp)) { + if (!wf_.DeSerialize(fp)) { return false; } - FloatToDouble(float_array, wf_); } if (training) { InitBackward(); - GENERIC_2D_ARRAY float_array; - if (!float_array.DeSerialize(fp)) { + if (!updates_.DeSerialize(fp)) { return false; } - FloatToDouble(float_array, updates_); // Errs was only used in int training, which is now dead. - if (!float_array.DeSerialize(fp)) { + GENERIC_2D_ARRAY float_array; + if (!float_array.DeSerialize(fp)) { return false; } } return true; -#endif } // Computes matrix.vector v = Wu. diff --git a/unittest/dotproduct_test.cc b/unittest/dotproduct_test.cc index e97eea325a..e37322ce7f 100644 --- a/unittest/dotproduct_test.cc +++ b/unittest/dotproduct_test.cc @@ -49,7 +49,6 @@ void DotProductTest::RunTest(TFloat (*f)(const TFloat *u, const TFloat *v, int n } } -TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n); TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) { TFloat total = 0; #pragma omp simd reduction(+:total)