From 97834d0364f4b7a1a5dcfa5625761d00d932751b Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Thu, 15 Jul 2021 15:23:41 +0200
Subject: [PATCH] This is what the idea expressed in #3490 looks like: using
 function templates so the TFloat float & double implementations can co-exist
 at run time without cluttering the code with #if/#else, and with no run-time
 switches (yet).

## Observations thus far

- DRY? Check!
- The whole "function templates (and let the C++ compiler do the heavy
  lifting)" idea stops somewhere. Regrettably, that happens to be in the
  weightmatrix.cpp code, where the CPU+configuration-selected SIMD
  implementation is invoked via a function pointer:
  `intSimdMatrix->matrixDotVectorFunction`. Supporting both float widths
  there would require code duplication of some kind (e.g. an FP32 callback
  pointer co-existing with an FP64 callback pointer in the struct, with the
  code picking the right one depending on the current TFloat size), which I
  deem unsatisfactory (my opinion).
- So far, and very probably independent of any solution for the co-existence
  issue at higher levels in the code, this template approach works out well:
  the compiler smartly picks the overload matching the current float/double
  choice.
- While we now have double the number of specialized SIMD implementations
  (obviously), these do not need #if/#else checks, as we can let the C++
  compiler do its prototype-matching job --> cleaner code.
- The template functions also help clean up the serialization /
  de-serialization code: the dual-type approach there allows one to specify
  the run-time type (TFloat) and the file-storage type at the same time (a
  sketch of the pattern is appended after the diff). Also note how this
  cleans up the 'old' scales de-serialization code, as the old file storage
  type is simply 'float' instead of 'double'.
- The added cost there is a double copy of the file data when T == ST, but
  that turned out negligible in preliminary tests: that bit of code didn't
  even reach the Top-20 CPU Guzzlers Chart, so the extra copy can wait for
  smarter C++ template writers to take care of when microtuning is called
  for.
---
 src/arch/dotproduct.cpp        |   5 +
 src/arch/dotproduct.h          |  37 +++--
 src/arch/dotproductavx.cpp     |  31 +++--
 src/arch/dotproductfma.cpp     |  37 +++--
 src/arch/dotproductsse.cpp     |  28 ++--
 src/arch/intsimdmatrixavx2.cpp | 237 ++++-----------------------------
 src/arch/intsimdmatrixneon.cpp |  14 +-
 src/arch/intsimdmatrixsse.cpp  |  59 +++-----
 src/arch/simddetect.cpp        |  19 ++-
 9 files changed, 172 insertions(+), 295 deletions(-)

diff --git a/src/arch/dotproduct.cpp b/src/arch/dotproduct.cpp
index f964e3256e..daba2ff25b 100644
--- a/src/arch/dotproduct.cpp
+++ b/src/arch/dotproduct.cpp
@@ -19,6 +19,7 @@ namespace tesseract {
 
 // Computes and returns the dot product of the two n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   TFloat total = 0;
 #if defined(OPENMP_SIMD) || defined(_OPENMP)
@@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   return total;
 }
 
+// Two instantiations: float & double.
+template float DotProductNative(const float *u, const float *v, int n);
+template double DotProductNative(const double *u, const double *v, int n);
+
 } // namespace tesseract
diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h
index c9b2756e2c..756918d5e3 100644
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@@ -22,22 +22,43 @@ namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
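+// Implemented as a function template so the float and double builds share
+// one body; the two explicit instantiations live in dotproduct.cpp.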
+template <class TFloat>
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
+
+// ------------ FAST FLOAT specializations -----------------
+
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+float DotProductAVX(const float *u, const float *v, int n);
+float DotProductAVX1(const float *u, const float *v, int n);
+float DotProductAVX2(const float *u, const float *v, int n);
+float DotProductAVX3(const float *u, const float *v, int n);
+float DotProductAVX4(const float *u, const float *v, int n);
+
+// Use Intel FMA.
+float DotProductFMA(const float *u, const float *v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+float DotProductSSE(const float *u, const float *v, int n);
+
+float DotProductAccelerate(const float *u, const float *v, int n);
+
+// ------------ HIGH PRECISION DOUBLE specializations -----------------
+
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);
+double DotProductAVX(const double *u, const double *v, int n);
+double DotProductAVX1(const double *u, const double *v, int n);
+double DotProductAVX2(const double *u, const double *v, int n);
+double DotProductAVX3(const double *u, const double *v, int n);
+double DotProductAVX4(const double *u, const double *v, int n);
 
 // Use Intel FMA.
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
+double DotProductFMA(const double *u, const double *v, int n);
 
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
+double DotProductSSE(const double *u, const double *v, int n);
+
+double DotProductAccelerate(const double *u, const double *v, int n);
 
-TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
 } // namespace tesseract.
 
 #endif // TESSERACT_ARCH_DOTPRODUCT_H_
diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
index 4c49e9e4a3..3bdb250fc1 100644
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@@ -15,11 +15,9 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__AVX__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for AVX capable architectures
-# endif
-#else
+#include "intsimdmatrix.h"
+
+#if defined(__AVX__)
 
 # include <immintrin.h>
 # include <stdint.h>
 
@@ -27,9 +25,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
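+// Float variant: accumulates eight packed floats per 256-bit register (the
+// unrolled AVX1 variant below consumes sixteen floats per loop iteration).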
-#if defined(FAST_FLOAT)
 float DotProductAVX(const float *u, const float *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -50,6 +49,7 @@ float DotProductAVX(const float *u, const float *v, int n) {
   }
   return result;
 }
+
 float DotProductAVX1(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
@@ -76,7 +76,9 @@ float DotProductAVX1(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductAVX1(const double *u, const double *v, int n) {
   __m256d t0 = _mm256_setzero_pd();
   __m256d t1 = _mm256_setzero_pd();
@@ -130,8 +132,21 @@ double DotProductAVX(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END FLOAT/DOUBLE sections ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without AVX: delegates to the FMA implementation.
+inline TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n) {
+  return DotProductFMA(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index 6afaefd3eb..704ae0bd01 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -15,11 +15,7 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__FMA__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for FMA capable architectures
-# endif
-#else
+#if defined(__FMA__)
 
 # include <immintrin.h>
 # include <stdint.h>
 
@@ -27,10 +23,11 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
+float DotProductFMA(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
@@ -48,15 +45,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
     v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) TFloat tmp[8];
+  alignas(32) float tmp[8];
   _mm256_store_ps(tmp, t0);
-  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductFMA(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -83,8 +82,24 @@ double DotProductFMA(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without FMA: delegates to the SSE implementation.
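+// Part of the graceful-degradation chain visible across these files:
+// AVX -> FMA -> SSE -> generic DotProductNative.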
+inline float DotProductFMA(const float *u, const float *v, int n) {
+  return DotProductSSE(u, v, n);
+}
+inline double DotProductFMA(const double *u, const double *v, int n) {
+  return DotProductSSE(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index 9122e9d1b1..3b26e43e63 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -15,11 +15,7 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if !defined(__SSE4_1__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for SSE 4.1 capable architectures
-# endif
-#else
+#if defined(__SSE4_1__)
 
 # include <emmintrin.h>
 # include <smmintrin.h>
 
@@ -28,9 +24,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductSSE(const float *u, const float *v, int n) {
   int max_offset = n - 4;
   int offset = 0;
@@ -89,7 +86,9 @@ float DotProductSSE(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductSSE(const double *u, const double *v, int n) {
   int max_offset = n - 2;
   int offset = 0;
@@ -139,8 +138,21 @@ double DotProductSSE(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Fallback for builds without SSE 4.1: delegates to the generic
+// implementation.
+inline TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
+  return DotProductNative(u, v, n);
+}
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp
index c782ebb38c..bd5bbf2695 100644
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@@ -17,11 +17,7 @@
 
 #include "intsimdmatrix.h"
 
-#if !defined(__AVX2__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for AVX2 capable architectures
-# endif
-#else
+#if defined(__AVX2__)
 # include <immintrin.h>
 # include <stdint.h>
 # include <algorithm>
@@ -85,7 +81,7 @@ static inline __m128i load64_to_128(const int8_t *wi_) {
   return _mm_set_epi64x(0, wi[0]);
 }
 
-#if defined(FAST_FLOAT)
+// ------------- FAST FLOAT specifics section -------------------------
 
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const float *scales,
                                    float *v) {
@@ -130,198 +126,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1,
   v += 16;
 }
 
-// Computes part of matrix.vector v = Wu. Computes N=64 results.
-// The weights *must* be arranged so that consecutive reads from wi
-// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
-// (kNumInputsPerGroup inputs))). After that there must be N consecutive
-// bias weights, before continuing with any more weights.
-// u must be padded out with zeros to
-// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  __m256i result4 = _mm256_setzero_si256();
-  __m256i result5 = _mm256_setzero_si256();
-  __m256i result6 = _mm256_setzero_si256();
-  __m256i result7 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-  ExtractResults16(result4, result5, wi, scales, v);
-  ExtractResults16(result6, result7, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=32 results.
-// For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=16 results.
-// For details see PartialMatrixDotVector64 with N=16.
-static void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=8 results.
-// For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u,
-                                           int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-    }
-  }
-  ExtractResults8(result0, wi, scales, v);
-}
-
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-  // Each call to a partial_func_ produces group_size outputs, except the
-  // last one, which can produce less.
-  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
-  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
-  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
-  int output = 0;
-
-  int w_step = (rounded_num_in + 1) * group_size;
-
-  // Run with this group size, until it would produce too much output, then
-  // switch to a smaller size.
-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
+// ------------- HIGH-PRECISION DOUBLE specifics section -------------------------
 
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
-  }
-}
-#else
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
                                    double *v) {
   __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
@@ -375,6 +181,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
   v += 16;
 }
 
+// ------------- END specifics section -------------------------
+
 // Computes part of matrix.vector v = Wu. Computes N=64 results.
 // The weights *must* be arranged so that consecutive reads from wi
 // provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
@@ -382,8 +190,9 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
 // bias weights, before continuing with any more weights.
 // u must be padded out with zeros to
 // kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector64(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -427,8 +236,9 @@ static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=32 results.
 // For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector32(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -462,8 +272,9 @@ static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=16 results.
 // For details see PartialMatrixDotVector64 with N=16.
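+// Templated on TFloat like the functions above: one body serves both the
+// float and the double build, instantiated by the compiler as needed.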
-static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector16(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -492,8 +303,9 @@ static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=8 results.
 // For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
-                                           int num_in, double *v) {
+template <class TFloat>
+static inline void PartialMatrixDotVector8(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                           int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -518,8 +330,9 @@ static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scale
   ExtractResults8(result0, wi, scales, v);
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
   const int num_out = dim1;
   const int num_in = dim2 - 1;
   // Each call to a partial_func_ produces group_size outputs, except the
@@ -566,7 +379,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *
     PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
   }
 }
-#endif
+
 
 static const IntSimdMatrix simdMatrix = {
   // Function.
@@ -585,4 +398,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = nullptr;
+
+} // namespace tesseract.
+
 #endif
diff --git a/src/arch/intsimdmatrixneon.cpp b/src/arch/intsimdmatrixneon.cpp
index 260f747d48..9eecc6cde4 100644
--- a/src/arch/intsimdmatrixneon.cpp
+++ b/src/arch/intsimdmatrixneon.cpp
@@ -16,10 +16,10 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
-#if defined(__ARM_NEON)
+#include "intsimdmatrix.h"
+#include "tfloat.h"
 
-# include "intsimdmatrix.h"
-# include "tfloat.h"
+#if defined(HAVE_NEON)
 
 # include <cstdint>
 # include <arm_neon.h>
 
@@ -212,4 +212,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixNEON = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixNEON = nullptr;
+
+} // namespace tesseract.
+
 #endif /* __ARM_NEON */
diff --git a/src/arch/intsimdmatrixsse.cpp b/src/arch/intsimdmatrixsse.cpp
index a46b319fd2..0e1d1f7b5b 100644
--- a/src/arch/intsimdmatrixsse.cpp
+++ b/src/arch/intsimdmatrixsse.cpp
@@ -17,44 +17,7 @@
 
 #include "intsimdmatrix.h"
 
-#if !defined(__SSE4_1__)
-# if defined(__i686__) || defined(__x86_64__)
-# error Implementation only for SSE 4.1 capable architectures
-# endif
-#elif defined(FAST_FLOAT)
-namespace tesseract {
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-//#pragma omp simd collapse(2)
-  for (int i = 0; i < num_out; ++i) {
-    int total = 0;
-#pragma omp simd reduction(+:total)
-    for (int j = 0; j < num_in; ++j) {
-      total += wi[j] * u[j];
-    }
-    // Add in the bias and correct for integer values.
-    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
-    wi += dim2;
-  }
-}
-
-static const IntSimdMatrix simdMatrix = {
-  matrixDotVector,
-  // Number of 32 bit outputs held in each register.
-  1,
-  // Maximum number of registers that we will use to hold outputs.
-  1,
-  // Number of 8 bit inputs in the inputs register.
-  1,
-  // Number of inputs in each weight group.
-  1
-};
-
-const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = &simdMatrix;
-}
-#else
+#if defined(__SSE4_1__)
 
 # include <emmintrin.h>
 # include <smmintrin.h>
 
@@ -102,15 +65,17 @@ static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
 }
 
 // Computes part of matrix.vector v = Wu. Computes 1 result.
-static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
-                                    int num_in, double *v) {
-  double total = IntDotProductSSE(u, wi, num_in);
+template <class TFloat>
+static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                    int num_in, TFloat *v) {
+  TFloat total = IntDotProductSSE(u, wi, num_in);
   // Add in the bias and correct for integer values.
   *v = (total + wi[num_in] * INT8_MAX) * *scales;
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
  const int num_out = dim1;
   const int num_in = dim2 - 1;
   int output = 0;
@@ -139,4 +104,12 @@ const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = &simdMatrix;
 
 } // namespace tesseract.
 
+#else
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = nullptr;
+
+} // namespace tesseract.
+ #endif diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp index 6c7f822239..f58c9eb494 100644 --- a/src/arch/simddetect.cpp +++ b/src/arch/simddetect.cpp @@ -24,6 +24,7 @@ #include "params.h" // for STRING_VAR #include "simddetect.h" #include "tprintf.h" // for tprintf +#include "tfloat.h" #if defined(HAVE_FRAMEWORK_ACCELERATE) @@ -101,16 +102,21 @@ bool SIMDDetect::sse_available_; #endif #if defined(HAVE_FRAMEWORK_ACCELERATE) -TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) { - TFloat total = 0; + +float DotProductAccelerate(const float* u, const float* v, int n) { + float total = 0; const int stride = 1; -#if defined(FAST_FLOAT) vDSP_dotpr(u, stride, v, stride, &total, n); -#else + return total; +} + +double DotProductAccelerate(const double* u, const double* v, int n) { + double total = 0; + const int stride = 1; vDSP_dotprD(u, stride, v, stride, &total, n); -#endif return total; } + #endif // Computes and returns the dot product of the two n-vectors u and v. @@ -258,7 +264,7 @@ SIMDDetect::SIMDDetect() { #if defined(HAVE_NEON) || defined(__aarch64__) } else if (neon_available_) { // NEON detected. - SetDotProduct(DotProduct, IntSimdMatrix::intSimdMatrixNEON); + SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrixNEON); #endif } } @@ -288,6 +294,7 @@ void SIMDDetect::Update() { SetDotProduct(DotProductAVX, IntSimdMatrix::intSimdMatrixAVX2); dotproduct_method = "avx2"; } else if (dotproduct == "avx-1") { + // AVX2 (Alternative Implementation) selected by config variable. SetDotProduct(DotProductAVX1, IntSimdMatrix::intSimdMatrixAVX2); dotproduct_method = "avx-1"; #endif
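--
The dual-type (de)serialization pattern referenced in the observations above
is not part of this diff excerpt. The following is a minimal sketch of the
idea only; the helper name `DeSerializeAs` and its signature are hypothetical
illustrations, not the actual tesseract API.

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

namespace tesseract {

// Hypothetical helper: read `count` values stored on disk as type ST and
// convert them element-wise to the run-time type T. For the 'old' scales
// format, ST is float regardless of whether TFloat is float or double.
template <typename T, typename ST>
bool DeSerializeAs(FILE *fp, std::vector<T> &out, size_t count) {
  std::vector<ST> tmp(count); // file-storage representation
  if (fread(tmp.data(), sizeof(ST), count, fp) != count) {
    return false; // short read: truncated or corrupt model file
  }
  // Element-wise ST -> T conversion; when T == ST this is the extra copy
  // mentioned above, which preliminary profiling showed to be negligible.
  out.assign(tmp.begin(), tmp.end());
  return true;
}

} // namespace tesseract
```

A double build would then read old models via `DeSerializeAs<double, float>`
and new ones via `DeSerializeAs<double, double>`, the compiler emitting both
variants from the single template body.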