This is what the idea expressed in tesseract-ocr#3490 looks like: using function templates so that the TFloat float & double implementations can co-exist at run-time without cluttering the code with #if/#else, and with no run-time switches (yet).

## Observations thus far

- DRY? Check!
- The whole function-template idea (let the C++ compiler do the heavy lifting) stops somewhere. Regrettably, that happens to be in the weightmatrix.cpp code, where the code calls the CPU+configuration-selected SIMD implementation via a function pointer: `intSimdMatrix->matrixDotVectorFunction`. This would require code duplication of some kind (e.g. an FP32 callback pointer co-existing with an FP64 callback pointer in the struct, with the code then picking the right one depending on the current TFloat size) and is thus deemed unsatisfactory (my opinion).
- So far, and very probably independent of any solution for the co-existence issue at higher levels in the code, this template approach works out well, with the compiler smartly picking the implementation that matches the current float/double choice.
- While we have double the number of specialized SIMD implementations (obviously), these do not need #if/#else checks, as we can let the C++ compiler do its prototype matching job --> cleaner code.
- The template functions also help clean up the serialization/de-serialization code, as the `<T, ST>` dual-type approach there allows one to specify the run-time type (TFloat) and the file-storage type at the same time. Also note how this cleans up the 'Old' scales deserialization code, as the old file storage is simply 'float' instead of 'double'.
- The added cost there is a double copy of file data when T==ST, but that turned out to be negligible in the preliminary tests: that bit of code didn't even reach the Top20 CPU Guzzlers Chart, so that extra copy can wait for smarter C++ template writers to take care of when microtuning is called for.
GerHobbelt · Jul 13, 2021 · 00ed4b2 · 00ed4b2
1 parent 3eae6d7
commit 00ed4b2
Show file tree

Hide file tree

Showing 11 changed files with 242 additions and 358 deletions.
diff --git a/src/arch/dotproduct.cpp b/src/arch/dotproduct.cpp
@@ -19,6 +19,7 @@
 namespace tesseract {
 
 // Computes and returns the dot product of the two n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   TFloat total = 0;
 #if defined(OPENMP_SIMD)
@@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
   return total;
 }
 
+// two instantiations: float & double.
+template float DotProductNative<float>(const float *u, const float *v, int n);
+template double DotProductNative<double>(const double *u, const double *v, int n);
+
 } // namespace tesseract
diff --git a/src/arch/dotproduct.h b/src/arch/dotproduct.h
@@ -22,22 +22,43 @@
 namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
+template <class TFloat>
 TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
 
+// ------------ FAST FLOAT specializations -----------------
+
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+float DotProductAVX(const float *u, const float *v, int n);
+float DotProductAVX1(const float *u, const float *v, int n);
+float DotProductAVX2(const float *u, const float *v, int n);
+float DotProductAVX3(const float *u, const float *v, int n);
+float DotProductAVX4(const float *u, const float *v, int n);
+
+// Use Intel FMA.
+float DotProductFMA(const float *u, const float *v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+float DotProductSSE(const float *u, const float *v, int n);
+
+float DotProductAccelerate(const float *u, const float *v, int n);
+
+// ------------ HIGH PRECISION DOUBLE specializations -----------------
+
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
-TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);
+double DotProductAVX(const double *u, const double *v, int n);
+double DotProductAVX1(const double *u, const double *v, int n);
+double DotProductAVX2(const double *u, const double *v, int n);
+double DotProductAVX3(const double *u, const double *v, int n);
+double DotProductAVX4(const double *u, const double *v, int n);
 
 // Use Intel FMA.
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
+double DotProductFMA(const double *u, const double *v, int n);
 
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
+double DotProductSSE(const double *u, const double *v, int n);
+
+double DotProductAccelerate(const double *u, const double *v, int n);
 
-TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
 } // namespace tesseract.
 
 #endif // TESSERACT_ARCH_DOTPRODUCT_H_
diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
@@ -27,9 +27,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductAVX(const float *u, const float *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -50,6 +51,7 @@ float DotProductAVX(const float *u, const float *v, int n) {
   }
   return result;
 }
+
 float DotProductAVX1(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
@@ -76,7 +78,9 @@ float DotProductAVX1(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductAVX1(const double *u, const double *v, int n) {
   __m256d t0 = _mm256_setzero_pd();
   __m256d t1 = _mm256_setzero_pd();
@@ -130,7 +134,8 @@ double DotProductAVX(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END FLOAT/DOUBLE sections ------------------------
 
 } // namespace tesseract.
 

diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
@@ -27,10 +27,11 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
-TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
+float DotProductFMA(const float *u, const float *v, int n) {
   const unsigned quot = n / 16;
   const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
@@ -48,15 +49,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
     v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) TFloat tmp[8];
+  alignas(32) float tmp[8];
   _mm256_store_ps(tmp, t0);
-  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductFMA(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -83,7 +86,8 @@ double DotProductFMA(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.
 

diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
@@ -28,9 +28,10 @@
 
 namespace tesseract {
 
+// ---------------------------- FAST FLOAT section ------------------------
+
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-#if defined(FAST_FLOAT)
 float DotProductSSE(const float *u, const float *v, int n) {
   int max_offset = n - 4;
   int offset = 0;
@@ -89,7 +90,9 @@ float DotProductSSE(const float *u, const float *v, int n) {
   }
   return result;
 }
-#else
+
+// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------
+
 double DotProductSSE(const double *u, const double *v, int n) {
   int max_offset = n - 2;
   int offset = 0;
@@ -139,7 +142,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
   }
   return result;
 }
-#endif
+
+// ---------------------------- END section ------------------------
 
 } // namespace tesseract.