diff --git a/src/arch/dotproduct.cpp b/src/arch/dotproduct.cpp
index 47c042f5df..9a0e5cb6f9 100644
--- a/src/arch/dotproduct.cpp
+++ b/src/arch/dotproduct.cpp
@@ -19,6 +19,7 @@
 namespace tesseract {
 
 // Computes and returns the dot product of the two n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   TFloat total = 0;
 #if defined(OPENMP_SIMD)
@@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   return total;
 }
 
+// Explicit instantiations for the two supported types: float and double.
+template float DotProductNative<float>(const float *u, const float *v, int n);
+template double DotProductNative<double>(const double *u, const double *v, int n);
+
 } // namespace tesseract
diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h
index c9b2756e2c..756918d5e3 100644
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@@ -22,22 +22,43 @@
 namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
 
+// ------------ FAST FLOAT specializations -----------------
+
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+float DotProductAVX(const float *u, const float *v, int n);
+float DotProductAVX1(const float *u, const float *v, int n);
+float DotProductAVX2(const float *u, const float *v, int n);
+float DotProductAVX3(const float *u, const float *v, int n);
+float DotProductAVX4(const float *u, const float *v, int n);
+
+// Use Intel FMA.
+float DotProductFMA(const float *u, const float *v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+float DotProductSSE(const float *u, const float *v, int n);
+
+float DotProductAccelerate(const float *u, const float *v, int n);
+
+// ------------ HIGH PRECISION DOUBLE specializations -----------------
+
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);
+double DotProductAVX(const double *u, const double *v, int n);
+double DotProductAVX1(const double *u, const double *v, int n);
+double DotProductAVX2(const double *u, const double *v, int n);
+double DotProductAVX3(const double *u, const double *v, int n);
+double DotProductAVX4(const double *u, const double *v, int n);
 
 // Use Intel FMA.
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
+double DotProductFMA(const double *u, const double *v, int n);
 
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
+double DotProductSSE(const double *u, const double *v, int n);
+
+double DotProductAccelerate(const double *u, const double *v, int n);
 
-TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
 } // namespace tesseract.
 
 #endif // TESSERACT_ARCH_DOTPRODUCT_H_
diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
index 4c49e9e4a3..275ae28363 100644
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@@ -27,9 +27,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductAVX(const float *u, const float *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -50,6 +51,7 @@ float DotProductAVX(const float *u, const float *v, int n) {
   }
   return result;
 }
+
 float DotProductAVX1(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
@@ -76,7 +78,9 @@ float DotProductAVX1(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductAVX1(const double *u, const double *v, int n) {
   __m256d t0 = _mm256_setzero_pd();
   __m256d t1 = _mm256_setzero_pd();
@@ -130,7 +134,8 @@ double DotProductAVX(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END FLOAT/DOUBLE sections ------------------------
 
 } // namespace tesseract.
 
diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index 6afaefd3eb..edc85d0b33 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -27,10 +27,11 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
+float DotProductFMA(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
@@ -48,15 +49,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
     v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) TFloat tmp[8];
+  alignas(32) float tmp[8];
   _mm256_store_ps(tmp, t0);
-  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductFMA(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -83,7 +86,8 @@ double DotProductFMA(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index 9122e9d1b1..39ddf01972 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -28,9 +28,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductSSE(const float *u, const float *v, int n) {
   int max_offset = n - 4;
   int offset = 0;
@@ -89,7 +90,9 @@ float DotProductSSE(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductSSE(const double *u, const double *v, int n) {
   int max_offset = n - 2;
   int offset = 0;
@@ -139,7 +142,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 
diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp
index c87ef414a7..92936309b3 100644
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@@ -21,53 +21,8 @@
 #  if defined(__i686__) || defined(__x86_64__)
 #    error Implementation only for AVX2 capable architectures
 #  endif
-#elif defined(FAST_FLOAT)
-namespace tesseract {
-
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-  for (int i = 0; i < num_out; ++i) {
-    for (int j = 0; j < num_in; ++j) {
-    }
-  }
-}
-
-#if 0
-void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
-                                    const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
-  int num_out = w.dim1();
-  int num_in = w.dim2() - 1;
-  // Base implementation.
-  for (int i = 0; i < num_out; ++i) {
-    const int8_t *wi = w[i];
-    int total = 0;
-    for (int j = 0; j < num_in; ++j) {
-      total += wi[j] * u[j];
-    }
-    // Add in the bias and correct for integer values.
-    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
-  }
-}
-#endif
-
-static const IntSimdMatrix simdMatrix = {
-    // Function.
-    matrixDotVector,
-    // Number of 32 bit outputs held in each register.
-    1,
-    // Maximum number of registers that we will use to hold outputs.
-    1,
-    // Number of 8 bit inputs in the inputs register.
-    1,
-    // Number of inputs in each weight group.
-    1
-};
-
-const IntSimdMatrix *IntSimdMatrix::intSimdMatrixAVX2 = &simdMatrix;
-}
 #else
+
 #  include <immintrin.h>
 #  include <algorithm>
 #  include <cstdint>
@@ -131,7 +86,7 @@ static inline __m128i load64_to_128(const int8_t *wi_) {
   return _mm_set_epi64x(0, wi[0]);
 }
 
-#if defined(FAST_FLOAT)
+// ------------- FAST FLOAT specifics section -------------------------
 
 static inline void ExtractResults8(__m256i result, const int8_t *wi,
                                    const float *scales, float *v) {
@@ -176,198 +131,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1,
   v += 16;
 }
 
-// Computes part of matrix.vector v = Wu. Computes N=64 results.
-// The weights *must* be arranged so that consecutive reads from wi
-// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
-// (kNumInputsPerGroup inputs))). After that there must be N consecutive
-// bias weights, before continuing with any more weights.
-// u must be padded out with zeros to
-// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  __m256i result4 = _mm256_setzero_si256();
-  __m256i result5 = _mm256_setzero_si256();
-  __m256i result6 = _mm256_setzero_si256();
-  __m256i result7 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-  ExtractResults16(result4, result5, wi, scales, v);
-  ExtractResults16(result6, result7, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=32 results.
-// For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  __m256i result2 = _mm256_setzero_si256();
-  __m256i result3 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-  ExtractResults16(result2, result3, wi, scales, v);
-}
+// ------------- HIGH-PRECISION DOUBLE specifics section -------------------------
 
-// Computes part of matrix.vector v = Wu. Computes N=16 results.
-// For details see PartialMatrixDotVector64 with N=16.
-static void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u,
-                                     int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  __m256i result1 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
-    }
-  }
-  ExtractResults16(result0, result1, wi, scales, v);
-}
-
-// Computes part of matrix.vector v = Wu. Computes N=8 results.
-// For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u,
-                                           int num_in, float *v) {
-  // Register containing 16-bit ones for horizontal add with 16->32 bit
-  // conversion.
-  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
-  // Initialize all the results to 0.
-  __m256i result0 = _mm256_setzero_si256();
-  // Iterate over the input (u), one registerful at a time.
-  for (int j = 0; j < num_in;) {
-    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
-    // Inputs are processed in groups of kNumInputsPerGroup, replicated
-    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
-      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
-      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
-      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
-      __m256i weights, reps;
-      // Mul-add, with horizontal add of the 4 inputs to each of the results.
-      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
-    }
-  }
-  ExtractResults8(result0, wi, scales, v);
-}
-
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
-                            const int8_t *u, float *v) {
-  const int num_out = dim1;
-  const int num_in = dim2 - 1;
-  // Each call to a partial_func_ produces group_size outputs, except the
-  // last one, which can produce less.
-  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
-  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
-  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
-  int output = 0;
-
-  int w_step = (rounded_num_in + 1) * group_size;
-
-  // Run with this group size, until it would produce too much output, then
-  // switch to a smaller size.
-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-    output += group_size;
-  }
-  group_size /= 2;
-  w_step /= 2;
-
-  if (output + group_size <= rounded_num_out) {
-    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
-  }
-}
-#else
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
                                    double *v) {
   __m128i w128 = load64_to_128(wi);          // 8x8bit vals in bottom of 128bit reg
@@ -421,6 +186,8 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
   v += 16;
 }
 
+// ------------- END specifics section -------------------------
+
 // Computes part of matrix.vector v = Wu. Computes N=64 results.
 // The weights *must* be arranged so that consecutive reads from wi
 // provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
@@ -428,8 +195,9 @@ static inline void ExtractResults16(__m256i result0, __m256i result1, const int8
 // bias weights, before continuing with any more weights.
 // u must be padded out with zeros to
 // kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector64(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -473,8 +241,9 @@ static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=32 results.
 // For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector32(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -508,8 +277,9 @@ static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=16 results.
 // For details see PartialMatrixDotVector64 with N=16.
-static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
-                                     int num_in, double *v) {
+template <class TFloat>
+static void PartialMatrixDotVector16(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                     int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -538,8 +308,9 @@ static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, con
 
 // Computes part of matrix.vector v = Wu. Computes N=8 results.
 // For details see PartialMatrixDotVector64 with N=8.
-static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
-                                           int num_in, double *v) {
+template <class TFloat>
+static inline void PartialMatrixDotVector8(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                           int num_in, TFloat *v) {
   // Register containing 16-bit ones for horizontal add with 16->32 bit
   // conversion.
   __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
@@ -564,8 +335,9 @@ static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scale
   ExtractResults8(result0, wi, scales, v);
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
   const int num_out = dim1;
   const int num_in = dim2 - 1;
   // Each call to a partial_func_ produces group_size outputs, except the
@@ -612,7 +384,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *
     PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
   }
 }
-#endif
+
 
 static const IntSimdMatrix simdMatrix = {
     // Function.
diff --git a/src/arch/intsimdmatrixsse.cpp b/src/arch/intsimdmatrixsse.cpp
index 7407f6f5a1..ab71a1f1d7 100644
--- a/src/arch/intsimdmatrixsse.cpp
+++ b/src/arch/intsimdmatrixsse.cpp
@@ -21,10 +21,6 @@
 #  if defined(__i686__) || defined(__x86_64__)
 #    error Implementation only for SSE 4.1 capable architectures
 #  endif
-#elif defined(FAST_FLOAT)
-namespace tesseract {
-const IntSimdMatrix *IntSimdMatrix::intSimdMatrixSSE = nullptr;
-}
 #else
 
 #  include <emmintrin.h>
@@ -73,15 +69,17 @@ static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
 }
 
 // Computes part of matrix.vector v = Wu. Computes 1 result.
-static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
-                                    int num_in, double *v) {
-  double total = IntDotProductSSE(u, wi, num_in);
+template <class TFloat>
+static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u,
+                                    int num_in, TFloat *v) {
+  TFloat total = IntDotProductSSE(u, wi, num_in);
   // Add in the bias and correct for integer values.
   *v = (total + wi[num_in] * INT8_MAX) * *scales;
 }
 
-static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
-                            const int8_t *u, double *v) {
+template <class TFloat>
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
+                            const int8_t *u, TFloat *v) {
   const int num_out = dim1;
   const int num_in = dim2 - 1;
   int output = 0;
diff --git a/src/ccstruct/matrix.h b/src/ccstruct/matrix.h
index a97912ad78..e1062294dc 100644
--- a/src/ccstruct/matrix.h
+++ b/src/ccstruct/matrix.h
@@ -145,45 +145,53 @@ class GENERIC_2D_ARRAY {
     }
   }
 
+  // -----------------------------------------------------------
+  // Serialization & Deserialization to disk uses specific Storage Types (ST)
+  // which MAY not be identical to the run-time Type (T).
+  // -----------------------------------------------------------
+  
   // Writes to the given file. Returns false in case of error.
   // Only works with bitwise-serializeable types!
+  template <class ST> // ST: type written to the file; elements are converted from T.
   bool Serialize(FILE *fp) const {
     if (!SerializeSize(fp)) {
       return false;
     }
-    if (!tesseract::Serialize(fp, &empty_)) {
+    if (!tesseract::Serialize<T, ST>(fp, &empty_)) {
       return false;
     }
     int size = num_elements();
-    return tesseract::Serialize(fp, &array_[0], size);
+    return tesseract::Serialize<T, ST>(fp, &array_[0], size);
   }
 
+  template <class ST>
   bool Serialize(TFile *fp) const {
     if (!SerializeSize(fp)) {
       return false;
     }
-    if (!fp->Serialize(&empty_)) {
+    if (!fp->Serialize<T, ST>(&empty_)) {
       return false;
     }
     int size = num_elements();
-    return fp->Serialize(&array_[0], size);
+    return fp->Serialize<T, ST>(&array_[0], size);
   }
 
   // Reads from the given file. Returns false in case of error.
   // Only works with bitwise-serializeable types!
   // If swap is true, assumes a big/little-endian swap is needed.
+  template <class ST> // ST: type read from the file. NOTE(review): empty_ is byte-swapped after conversion to T - verify when ST != T.
   bool DeSerialize(bool swap, FILE *fp) {
     if (!DeSerializeSize(swap, fp)) {
       return false;
     }
-    if (!tesseract::DeSerialize(fp, &empty_)) {
+    if (!tesseract::DeSerialize<T, ST>(fp, &empty_)) {
       return false;
     }
     if (swap) {
       ReverseN(&empty_, sizeof(empty_));
     }
     int size = num_elements();
-    if (!tesseract::DeSerialize(fp, &array_[0], size)) {
+    if (!tesseract::DeSerialize<T, ST>(fp, &array_[0], size)) {
       return false;
     }
     if (swap) {
@@ -194,9 +202,10 @@ class GENERIC_2D_ARRAY {
     return true;
   }
 
+  template <class ST>
   bool DeSerialize(TFile *fp) {
-    return DeSerializeSize(fp) && fp->DeSerialize(&empty_) &&
-           fp->DeSerialize(&array_[0], num_elements());
+    return DeSerializeSize(fp) && fp->DeSerialize<T, ST>(&empty_) &&
+           fp->DeSerialize<T, ST>(&array_[0], num_elements());
   }
 
   // Writes to the given file. Returns false in case of error.
@@ -291,7 +300,7 @@ class GENERIC_2D_ARRAY {
   void operator+=(const GENERIC_2D_ARRAY<T> &addend) {
     if (dim2_ == addend.dim2_) {
       // Faster if equal size in the major dimension.
-      int size = std::min(num_elements(), addend.num_elements());
+      int size = std::min<int>(num_elements(), addend.num_elements());
       for (int i = 0; i < size; ++i) {
         array_[i] += addend.array_[i];
       }
@@ -307,7 +316,7 @@ class GENERIC_2D_ARRAY {
   void operator-=(const GENERIC_2D_ARRAY<T> &minuend) {
     if (dim2_ == minuend.dim2_) {
       // Faster if equal size in the major dimension.
-      int size = std::min(num_elements(), minuend.num_elements());
+      int size = std::min<int>(num_elements(), minuend.num_elements());
       for (int i = 0; i < size; ++i) {
         array_[i] -= minuend.array_[i];
       }
@@ -467,8 +476,8 @@ class GENERIC_2D_ARRAY {
   // Higher dimensions above 2 are strictly the responsibility of the caller.
   void RotatingTranspose(const int *dims, int num_dims, int src_dim, int dest_dim,
                          GENERIC_2D_ARRAY<T> *result) const {
-    int max_d = std::max(src_dim, dest_dim);
-    int min_d = std::min(src_dim, dest_dim);
+    int max_d = std::max<int>(src_dim, dest_dim);
+    int min_d = std::min<int>(src_dim, dest_dim);
     // In a tensor of shape [d0, d1... min_d, ... max_d, ... dn-2, dn-1], the
     // ends outside of min_d and max_d are unaffected, with [max_d +1, dn-1]
     // being contiguous blocks of data that will move together, and
@@ -632,7 +641,7 @@ class BandTriMatrix : public GENERIC_2D_ARRAY<T> {
   // to *this.
   void AttachOnCorner(BandTriMatrix<T> *array2) {
     int new_dim1 = this->dim1_ + array2->dim1_;
-    int new_dim2 = std::max(this->dim2_, array2->dim2_);
+    int new_dim2 = std::max<int>(this->dim2_, array2->dim2_);
     T *new_array = new T[new_dim1 * new_dim2];
     for (int col = 0; col < new_dim1; ++col) {
       for (int j = 0; j < new_dim2; ++j) {
diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
index a07ed394eb..f656d5b8ca 100644
--- a/src/ccutil/serialis.h
+++ b/src/ccutil/serialis.h
@@ -26,6 +26,7 @@
 #include <cstring>
 #include <type_traits>
 #include <vector> // std::vector
+#include "tfloat.h"
 
 namespace tesseract {
 
@@ -43,6 +44,11 @@ constexpr size_t countof(T const (&)[N]) noexcept {
   return N;
 }
 
+// Function to read a std::vector<char> from a whole file.
+// Returns false on failure.
+// using FileReader = bool (*)(const char* filename, std::vector<char>* data);
+// ^-- imported from baseapi.h
+
 // Function to write a std::vector<char> to a whole file.
 // Returns false on failure.
 using FileWriter = bool (*)(const std::vector<char> &data, const char *filename);
@@ -64,6 +70,53 @@ bool Serialize(FILE *fp, const T *data, size_t n = 1) {
   return fwrite(data, sizeof(T), n, fp) == n;
 }
 
+// Reads n elements stored as ST from fp and converts each into data (type T).
+template <typename T, typename ST>
+bool DeSerialize(FILE *fp, T *data, size_t n = 1) {
+  // Staging buffer in the storage type; std::vector frees it on every path.
+  std::vector<ST> stored(n);
+  if (fread(stored.data(), sizeof(ST), n, fp) != n) {
+    return false; // Short read: data is left unmodified.
+  }
+  for (size_t i = 0; i < n; i++) {
+    data[i] = stored[i]; // Implicit ST -> T conversion.
+  }
+  return true;
+}
+
+template
+bool DeSerialize<double, double>(FILE* fp, double* data, size_t n);
+template
+bool DeSerialize<double, float>(FILE* fp, double* data, size_t n);
+template
+bool DeSerialize<float, float>(FILE* fp, float* data, size_t n);
+template
+bool DeSerialize<float, double>(FILE* fp, float* data, size_t n);
+
+// Converts n elements of data (type T) to ST and writes them to fp.
+template <typename T, typename ST>
+bool Serialize(FILE *fp, const T *data, size_t n = 1) {
+  // Staging buffer in the storage type; std::vector frees it on every path.
+  std::vector<ST> stored(n);
+  for (size_t i = 0; i < n; i++) {
+    stored[i] = data[i]; // Implicit T -> ST conversion.
+  }
+  // True only if all n elements were written.
+  return fwrite(stored.data(), sizeof(ST), n, fp) == n;
+}
+
+template 
+bool Serialize<double, double>(FILE* fp, const double* data, size_t n);
+template
+bool Serialize<double, float>(FILE* fp, const double* data, size_t n);
+template
+bool Serialize<float, float>(FILE* fp, const float* data, size_t n);
+template
+bool Serialize<float, double>(FILE* fp, const float* data, size_t n);
+
+
+
+
 // Simple file class.
 // Allows for portable file input from memory and from foreign file systems.
 class TESS_API TFile {
@@ -128,9 +181,9 @@ class TESS_API TFile {
       data.resize(size);
       for (uint32_t i = 0; i < size; i++) {
         uint8_t non_null;
-	if (!DeSerialize(&non_null)) {
+        if (!DeSerialize(&non_null)) {
           return false;
-	}
+        }
         if (non_null) {
           typedef typename std::remove_pointer<T>::type ST;
           auto item = new ST;
@@ -149,13 +202,72 @@ class TESS_API TFile {
     }
     return true;
   }
+  // Reads count elements of type ST via FReadEndian and converts each one
+  // to T. Returns false, leaving data untouched, unless FReadEndian reports
+  // exactly count elements.
+  template <typename T, typename ST>
+  bool DeSerialize(T *data, size_t count = 1) {
+    // std::vector owns the staging buffer, so no manual delete[] is needed.
+    std::vector<ST> stored(count);
+    if (FReadEndian(stored.data(), sizeof(ST), count) != static_cast<int>(count)) {
+      return false;
+    }
+    for (size_t i = 0; i < count; i++) {
+      data[i] = static_cast<T>(stored[i]);
+    }
+    return true;
+  }
+  // Reads a vector whose elements are stored as type ST and copies it into
+  // data, converting each element to T. On failure data is left unchanged.
+  template <typename T, typename ST>
+  bool DeSerialize(std::vector<T> &data) {
+    std::vector<ST> stored;
+    if (!DeSerialize(stored)) {
+      return false;
+    }
+    const size_t num_items = stored.size();
+    data.resize(num_items);
+    for (size_t k = 0; k < num_items; k++) {
+      data[k] = stored[k];
+    }
+    return true;
+  }
+#if 0
+  template bool DeSerialize<double, double>(double* data, size_t count);
+  template bool DeSerialize<double, float>(double* data, size_t count);
+  template bool DeSerialize<float, float>(float* data, size_t count);
+  template bool DeSerialize<float, double>(float* data, size_t count);
+#endif
 
   // Serialize data.
   bool Serialize(const std::string &data);
   bool Serialize(const std::vector<char> &data);
   template <typename T>
+  bool Serialize(const T* data, size_t count = 1) {
+	  return FWrite(data, sizeof(T), count) == static_cast<int>(count);
+  }
+  template <typename T, typename ST>
   bool Serialize(const T *data, size_t count = 1) {
-    return FWrite(data, sizeof(T), count) == static_cast<int>(count);
+	  ST* arr = new ST[count];
+  	  for (size_t i = 0; i < count; i++)
+	  {
+		arr[i] = data[i];
+	  }
+	  bool rv = (FWrite(&arr[0], sizeof(ST), count) == static_cast<int>(count));
+	  delete[] arr;
+	  return rv;
+  }
+  template <typename T, typename ST>
+  bool Serialize(const std::vector<T>& data)
+  {
+	  std::vector<ST> arr;
+		size_t len = data.size();
+		arr.resize(len);
+		for (size_t i = 0; i < len; i++) {
+			arr[i] = data[i];
+		}
+		bool rv = Serialize(arr);
+		return rv;
   }
   template <typename T>
   bool Serialize(const std::vector<T> &data) {
@@ -181,14 +293,14 @@ class TESS_API TFile {
       // Serialize pointers.
       for (auto &item : data) {
         uint8_t non_null = (item != nullptr);
-	if (!Serialize(&non_null)) {
+        if (!Serialize(&non_null)) {
           return false;
-	}
+        }
         if (non_null) {
           if (!item->Serialize(this)) {
             return false;
-	  }
-	}
+          }
+        }
       }
     } else if (size > 0) {
       // Serialize a non-class.
diff --git a/src/lstm/weightmatrix.cpp b/src/lstm/weightmatrix.cpp
index ba7b8653e7..46e10433ca 100644
--- a/src/lstm/weightmatrix.cpp
+++ b/src/lstm/weightmatrix.cpp
@@ -22,6 +22,7 @@
 #include "simddetect.h" // for DotProduct
 #include "statistc.h"
 #include "tprintf.h"    // forTFloat
+#include "tfloat.h"
 
 namespace tesseract {
 
@@ -37,7 +38,6 @@ const int kAdamCorrectionIterations = 200000;
 const TFloat kAdamEpsilon = 1e-8;
 
 // Utility functions convert between double and float arrays.
-#ifdef FAST_FLOAT
 static void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<float> &dst) {
   const auto dim1 = src.dim1();
   const auto dim2 = src.dim2();
@@ -50,7 +50,6 @@ static void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<
     }
   }
 }
-#endif
 
 static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<double> &dst) {
   const auto dim1 = src.dim1();
@@ -66,28 +65,14 @@ static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<d
 }
 
 static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
-#ifdef FAST_FLOAT
-  GENERIC_2D_ARRAY<double> double_array;
-  if (!double_array.DeSerialize(fp)) {
-    return false;
-  }
-  DoubleToFloat(double_array, tfloat_array);
-  return true;
-#else
-  return tfloat_array.DeSerialize(fp);
-#endif
+  return tfloat_array.DeSerialize<double>(fp);
 }
 
 static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
-#ifdef FAST_FLOAT
-  GENERIC_2D_ARRAY<double> double_array;
-  FloatToDouble(tfloat_array, double_array);
-  return double_array.Serialize(fp);
-#else
-  return tfloat_array.Serialize(fp);
-#endif
+  return tfloat_array.Serialize<double>(fp);
 }
 
+
 // Computes matrix.vector v = Wu.
 // u is of size W.dim2() - add_bias_fwd and the output v is of size
 // W.dim1() - skip_bias_back.
@@ -243,7 +228,7 @@ bool WeightMatrix::Serialize(bool training, TFile *fp) const {
     return false;
   }
   if (int_mode_) {
-    if (!wi_.Serialize(fp)) {
+    if (!wi_.Serialize<int8_t>(fp)) {
       return false;
     }
     // The scales stored in memory have an extra factor applied to them
@@ -257,14 +242,9 @@ bool WeightMatrix::Serialize(bool training, TFile *fp) const {
     if (!fp->Serialize(&size)) {
       return false;
     }
-#ifdef FAST_FLOAT
-    assert(!"not implemented");
-    return false;
-#else
-    if (!fp->Serialize(&scales[0], size)) {
+    if (!fp->Serialize<TFloat, double>(&scales[0], size)) {
       return false;
     }
-#endif
   } else {
     if (!tesseract::Serialize(fp, wf_)) {
       return false;
@@ -294,31 +274,20 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
     return DeSerializeOld(training, fp);
   }
   if (int_mode_) {
-    if (!wi_.DeSerialize(fp)) {
+    if (!wi_.DeSerialize<int8_t>(fp)) {
       return false;
     }
     uint32_t size;
     if (!fp->DeSerialize(&size)) {
       return false;
     }
-#ifdef FAST_FLOAT
-    scales_.reserve(size);
-    for (auto n = size; n > 0; n--) {
-      double val;
-      if (!fp->DeSerialize(&val)) {
-        return false;
-      }
-      scales_.push_back(val / INT8_MAX);
-    }
-#else
     scales_.resize(size);
-    if (!fp->DeSerialize(&scales_[0], size)) {
+    if (!fp->DeSerialize<TFloat, double>(&scales_[0], size)) {
       return false;
     }
     for (auto &scale : scales_) {
       scale /= INT8_MAX;
     }
-#endif
     if (IntSimdMatrix::intSimdMatrix) {
       int32_t rounded_num_out;
       IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
@@ -346,44 +315,30 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
 // As DeSerialize, but reads an old (float) format WeightMatrix for
 // backward compatibility.
 bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
-#ifdef FAST_FLOAT
-  // Not implemented.
-  assert(!"not implemented");
-  return false;
-#else
   if (int_mode_) {
-    if (!wi_.DeSerialize(fp)) {
+    if (!wi_.DeSerialize<int8_t>(fp)) {
       return false;
     }
-    std::vector<float> old_scales;
-    if (!fp->DeSerialize(old_scales)) {
+    if (!fp->DeSerialize<TFloat, float>(scales_)) {
       return false;
     }
-    scales_.reserve(old_scales.size());
-    for (float old_scale : old_scales) {
-      scales_.push_back(old_scale);
-    }
   } else {
-    GENERIC_2D_ARRAY<float> float_array;
-    if (!float_array.DeSerialize(fp)) {
+    if (!wf_.DeSerialize<float>(fp)) {
       return false;
     }
-    FloatToDouble(float_array, wf_);
   }
   if (training) {
     InitBackward();
-    GENERIC_2D_ARRAY<float> float_array;
-    if (!float_array.DeSerialize(fp)) {
+    if (!updates_.DeSerialize<float>(fp)) {
       return false;
     }
-    FloatToDouble(float_array, updates_);
     // Errs was only used in int training, which is now dead.
-    if (!float_array.DeSerialize(fp)) {
+	GENERIC_2D_ARRAY<float> float_array;
+	if (!float_array.DeSerialize<float>(fp)) {
       return false;
     }
   }
   return true;
-#endif
 }
 
 // Computes matrix.vector v = Wu.
diff --git a/unittest/dotproduct_test.cc b/unittest/dotproduct_test.cc
index e97eea325a..e37322ce7f 100644
--- a/unittest/dotproduct_test.cc
+++ b/unittest/dotproduct_test.cc
@@ -49,7 +49,6 @@ void DotProductTest::RunTest(TFloat (*f)(const TFloat *u, const TFloat *v, int n
   }
 }
 
-TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n);
 TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
   TFloat total = 0;
 #pragma omp simd reduction(+:total)