Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improved #3494: AVX2 bugfixes + no code duplication for the integer workhorses in there #3495

Closed
wants to merge 11 commits into from
17 changes: 17 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -147,38 +147,44 @@ libtesseract_native_la_CXXFLAGS = -O3 -ffast-math
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
endif
libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp

if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
endif

if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif

if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
endif

if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
Expand Down Expand Up @@ -1230,6 +1236,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += dawg_test
endif # ENABLE_TRAINING
check_PROGRAMS += denorm_test
check_PROGRAMS += dotproduct_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += equationdetect_test
endif # !DISABLED_LEGACY_ENGINE
Expand Down Expand Up @@ -1356,6 +1363,16 @@ denorm_test_SOURCES = unittest/denorm_test.cc
denorm_test_CPPFLAGS = $(unittest_CPPFLAGS)
denorm_test_LDADD = $(TESS_LIBS)

dotproduct_test_SOURCES = unittest/dotproduct_test.cc
dotproduct_test_CPPFLAGS = $(unittest_CPPFLAGS)
if HAVE_AVX2
dotproduct_test_CPPFLAGS += -DHAVE_AVX2
endif
if HAVE_SSE4_1
dotproduct_test_CPPFLAGS += -DHAVE_SSE4_1
endif
dotproduct_test_LDADD = $(TESS_LIBS)

if !DISABLED_LEGACY_ENGINE
equationdetect_test_SOURCES = unittest/equationdetect_test.cc
equationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)
Expand Down
13 changes: 7 additions & 6 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ m4_define([MY_CHECK_FRAMEWORK],
])
if test "$my_cv_framework_$1"="yes"; then
AC_DEFINE(AS_TR_CPP([HAVE_FRAMEWORK_$1]), 1,
[Define if you have the $1 framework])
[Define if you have the $1 framework])
AS_TR_CPP([FRAMEWORK_$1])="-framework $1"
AC_SUBST(AS_TR_CPP([FRAMEWORK_$1]))
fi]
Expand All @@ -295,13 +295,14 @@ OPENCL_CPPFLAGS=''
OPENCL_LDFLAGS=''
case "${host_os}" in
*darwin* | *-macos10*)
echo "checking for OpenCL framework"
MY_CHECK_FRAMEWORK([OpenCL])
if test $my_cv_framework_OpenCL = yes; then
have_opencl_lib=true
MY_CHECK_FRAMEWORK([Accelerate])
if test $my_cv_framework_Accelerate = yes; then
AM_CPPFLAGS="-DHAVE_FRAMEWORK_ACCELERATE $AM_CPPFLAGS"
LDFLAGS="$LDFLAGS -framework Accelerate"
fi
MY_CHECK_FRAMEWORK([OpenCL])
if test "$enable_opencl" = "yes"; then
if !($have_opencl_lib); then
if test $my_cv_framework_OpenCL = no; then
AC_MSG_ERROR([Required OpenCL library not found!])
fi
AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS"
Expand Down
7 changes: 4 additions & 3 deletions src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n) {
double total = 0.0;
for (int k = 0; k < n; ++k) {
// Computes and returns the dot product of the two n-vectors u and v.
// Portable C++ fallback used when no SIMD implementation is selected.
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
  TFloat total = 0;
  // Fix: the directive was misspelled "omp simdi"; compilers silently
  // ignore unknown pragmas, so the SIMD-vectorization hint was lost.
  // "omp simd" with a sum reduction lets the compiler vectorize safely.
#pragma omp simd reduction(+:total)
  for (int k = 0; k < n; k++) {
    total += u[k] * v[k];
  }
  return total;
}
Expand Down
15 changes: 11 additions & 4 deletions src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,27 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_

#include "tfloat.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n);
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);

// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);

TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
74 changes: 74 additions & 0 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,79 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) variant: one 256-bit accumulator, 8 floats per
// iteration. Uses unaligned loads, so u and v need no special alignment.
float DotProductAVX(const float *u, const float *v, int n) {
// Main loop covers quot * 8 elements; the remaining rem (< 8) elements
// are handled by the scalar tail loop below.
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m256 t0 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
f0 = _mm256_mul_ps(f0, f1);
t0 = _mm256_add_ps(t0, f0);
u += 8;
v += 8;
}
// Spill the 8 lane-wise partial sums to a 32-byte-aligned buffer
// (required by _mm256_store_ps) and reduce them to a scalar.
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
// Scalar tail for the leftover elements.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
// Unrolled float variant: two independent 256-bit accumulators process
// 16 floats per iteration, hiding the latency of the add chain.
float DotProductAVX1(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
__m256 t1 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
// Two non-overlapping 8-float loads per input vector (offset 0 and 8).
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
__m256 f2 = _mm256_loadu_ps(u + 8);
__m256 f3 = _mm256_loadu_ps(v + 8);
f0 = _mm256_mul_ps(f0, f1);
f2 = _mm256_mul_ps(f2, f3);
t0 = _mm256_add_ps(t0, f0);
t1 = _mm256_add_ps(t1, f2);
u += 16;
v += 16;
}
// Horizontal add interleaves pairwise sums of t0 and t1; summing all
// 8 output lanes below yields the total of all 16 partial sums.
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
// Scalar tail for the n % 16 leftover elements.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else
// Unrolled double variant: two 256-bit accumulators (4 doubles each)
// process 8 doubles per iteration.
double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
// Count the main loop down from n / 8; pointers advance 8 per pass.
for (unsigned quot = n / 8; quot > 0; quot--) {
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
__m256d f2 = _mm256_loadu_pd(u + 4);
__m256d f3 = _mm256_loadu_pd(v + 4);
f0 = _mm256_mul_pd(f0, f1);
f2 = _mm256_mul_pd(f2, f3);
t0 = _mm256_add_pd(t0, f0);
t1 = _mm256_add_pd(t1, f2);
u += 8;
v += 8;
}
// hadd interleaves pairwise sums of t0 and t1; summing the 4 output
// lanes below yields the total of all 8 partial sums.
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
// Scalar tail for the n % 8 leftover elements.
for (unsigned rem = n % 8; rem > 0; rem--) {
result += *u++ * *v++;
}
return result;
}

double DotProductAVX(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -57,6 +130,7 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
29 changes: 29 additions & 0 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,34 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) variant: two 256-bit FMA accumulators, 16 floats
// per iteration; unaligned loads, scalar tail for the remainder.
//
// Bug fixes versus the previous revision:
//  1. quot was n / 8 while each iteration issued two overlapping
//     8-float loads advanced by only 4 elements, double-counting
//     elements 4..7 of every group and over-reading the inputs.
//     A 256-bit ps load covers 8 floats, so the pair of loads must
//     consume 16 elements per iteration (quot = n / 16, u/v += 8 twice).
//  2. The spill buffer was float tmp[4], but _mm256_store_ps writes
//     8 floats — a stack buffer overflow — and only 4 lanes were
//     summed, dropping half of the partial sums.
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
  const unsigned quot = n / 16;
  const unsigned rem = n % 16;
  __m256 t0 = _mm256_setzero_ps();
  __m256 t1 = _mm256_setzero_ps();
  for (unsigned k = 0; k < quot; k++) {
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    t0 = _mm256_fmadd_ps(f0, f1, t0);
    u += 8;
    v += 8;
    __m256 f2 = _mm256_loadu_ps(u);
    __m256 f3 = _mm256_loadu_ps(v);
    t1 = _mm256_fmadd_ps(f2, f3, t1);
    u += 8;
    v += 8;
  }
  // hadd interleaves pairwise sums of t0 and t1; summing all 8 output
  // lanes yields the total of all 16 partial sums.
  t0 = _mm256_hadd_ps(t0, t1);
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  // Scalar tail for the n % 16 leftover elements.
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }
  return result;
}
#else
double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -55,6 +83,7 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
10 changes: 10 additions & 0 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) placeholder: a plain scalar dot product — no SSE
// intrinsics yet for this type; the compiler is left to vectorize it.
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
  TFloat sum = 0.0;
  const TFloat *u_end = u + n;
  while (u != u_end) {
    sum += *u++ * *v++;
  }
  return sum;
}
#else
double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
Expand Down Expand Up @@ -78,6 +87,7 @@ double DotProductSSE(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
2 changes: 1 addition & 1 deletion src/arch/intsimdmatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t>
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
const std::vector<double> &scales, const int8_t *u, double *v) {
const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// Base implementation.
Expand Down
16 changes: 9 additions & 7 deletions src/arch/intsimdmatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <cstdint>
#include <vector>

#include "tfloat.h"

namespace tesseract {

template <class T>
Expand Down Expand Up @@ -78,8 +80,8 @@ struct TESS_API IntSimdMatrix {
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Computes the base C++ implementation.
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
const int8_t *u, double *v);
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales,
const int8_t *u, TFloat *v);

// Rounds the input up to a multiple of the given factor.
static int Roundup(int input, int factor) {
Expand All @@ -95,8 +97,8 @@ struct TESS_API IntSimdMatrix {
// RoundInputs above.
// The input will be over-read to the extent of the padding. There are no
// alignment requirements.
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
double *);
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *,
TFloat *);
MatrixDotVectorFunction matrixDotVectorFunction;

// Number of 32 bit outputs held in each register.
Expand All @@ -112,10 +114,10 @@ struct TESS_API IntSimdMatrix {

static const IntSimdMatrix *intSimdMatrix;
// Only available with NEON.
static const IntSimdMatrix intSimdMatrixNEON;
static const IntSimdMatrix *intSimdMatrixNEON;
// Only available with AVX2 / SSE.
static const IntSimdMatrix intSimdMatrixAVX2;
static const IntSimdMatrix intSimdMatrixSSE;
static const IntSimdMatrix *intSimdMatrixAVX2;
static const IntSimdMatrix *intSimdMatrixSSE;
};

} // namespace tesseract
Expand Down
Loading