Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TFloat: float and double coexistence -- working towards that goal #7

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
731726a
Add TFloat data type for neural network
stweil Jun 30, 2021
fa1850f
Fix some compiler warnings
stweil Jul 7, 2021
507b8cb
Optimize DotProductStdInnerProduct for float
stweil Jul 7, 2021
0f9acea
Avoid double / float conversion
stweil Jul 13, 2021
16437ca
Implement TFloat for IntSimdMatrix
stweil Jul 13, 2021
8e77429
Test more implementations of DotProduct
stweil Jul 13, 2021
c7034f0
Add unittest for dotproduct
stweil Jul 10, 2021
f497c18
Support Apple Accelerate framework for training and best models
stweil Feb 28, 2021
4676d22
Fix TFloat builds for Apple M1
stweil Jul 13, 2021
82236f6
Fix DotProductNative for TFloat
stweil Jul 13, 2021
1402521
bugfix of FMA port to FAST_FLOAT: 8 float FPs fit in a single 256bit …
GerHobbelt Jul 13, 2021
00feac2
extracted from 3490: implements DotProductSSE() for FAST_FLOAT
GerHobbelt Jul 13, 2021
dd3b5f2
bugfixing the AVX2 Extract8+16 codes, where there's lines like `__m25…
GerHobbelt Jul 13, 2021
a9ff366
Improve build code for native dotproduct
stweil Jul 13, 2021
9c6503b
Enhance unittest/dotproduct_test
stweil Jul 13, 2021
284fdb0
Remove test code for fast float dotproduct
stweil Jul 13, 2021
a71edc9
Implement fast float dotproduct for SSE IntSimdMatrix
stweil Jul 13, 2021
ffea0f2
Partially revert "Merge pull request #3330 from Sintun/master"
stweil Jul 14, 2021
77cd861
Place TFloat type in the tesseract namespace, same as has been done w…
GerHobbelt Jul 15, 2021
8d1c1e1
just a couple of 'shadowed local variables' compiler warning fixes th…
GerHobbelt Jul 13, 2021
97834d0
This is how the idea expressed in #3490 looks like: using function te…
GerHobbelt Jul 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -147,41 +147,53 @@ libtesseract_native_la_CXXFLAGS = -O3 -ffast-math
if OPENMP_SIMD
libtesseract_native_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD
endif
if HAVE_AVX
libtesseract_native_la_CXXFLAGS += -mavx
endif
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
endif
libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp

if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
endif

if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif

if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
endif

if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
if OPENMP_SIMD
libtesseract_sse_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD
endif
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
Expand Down Expand Up @@ -1233,6 +1245,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += dawg_test
endif # ENABLE_TRAINING
check_PROGRAMS += denorm_test
check_PROGRAMS += dotproduct_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += equationdetect_test
endif # !DISABLED_LEGACY_ENGINE
Expand Down Expand Up @@ -1359,6 +1372,16 @@ denorm_test_SOURCES = unittest/denorm_test.cc
denorm_test_CPPFLAGS = $(unittest_CPPFLAGS)
denorm_test_LDADD = $(TESS_LIBS)

dotproduct_test_SOURCES = unittest/dotproduct_test.cc
dotproduct_test_CPPFLAGS = $(unittest_CPPFLAGS)
if HAVE_AVX2
dotproduct_test_CPPFLAGS += -DHAVE_AVX2
endif
if HAVE_SSE4_1
dotproduct_test_CPPFLAGS += -DHAVE_SSE4_1
endif
dotproduct_test_LDADD = $(TESS_LIBS)

if !DISABLED_LEGACY_ENGINE
equationdetect_test_SOURCES = unittest/equationdetect_test.cc
equationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)
Expand Down
11 changes: 6 additions & 5 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,14 @@ OPENCL_CPPFLAGS=''
OPENCL_LDFLAGS=''
case "${host_os}" in
*darwin* | *-macos10*)
echo "checking for OpenCL framework"
MY_CHECK_FRAMEWORK([OpenCL])
if test $my_cv_framework_OpenCL = yes; then
have_opencl_lib=true
MY_CHECK_FRAMEWORK([Accelerate])
if test $my_cv_framework_Accelerate = yes; then
AM_CPPFLAGS="-DHAVE_FRAMEWORK_ACCELERATE $AM_CPPFLAGS"
LDFLAGS="$LDFLAGS -framework Accelerate"
fi
MY_CHECK_FRAMEWORK([OpenCL])
if test "$enable_opencl" = "yes"; then
if !($have_opencl_lib); then
if test $my_cv_framework_OpenCL = no; then
AC_MSG_ERROR([Required OpenCL library not found!])
fi
AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS"
Expand Down
1 change: 0 additions & 1 deletion src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
hocr_str << "\n </span>";
++scnt;
} else if (lstm_choice_mode == 2) {
tesseract::ChoiceIterator ci(*res_it);
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
Expand Down
11 changes: 8 additions & 3 deletions src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,20 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
// Scalar reference implementation; when OpenMP (SIMD) support is enabled
// the pragma lets the compiler vectorize the reduction.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
  TFloat sum = 0;
#if defined(OPENMP_SIMD) || defined(_OPENMP)
#pragma omp simd reduction(+:sum)
#endif
  for (int i = 0; i < n; ++i) {
    sum += u[i] * v[i];
  }
  return sum;
}

// two instantiations: float & double.
template float DotProductNative<float>(const float *u, const float *v, int n);
template double DotProductNative<double>(const double *u, const double *v, int n);

} // namespace tesseract
30 changes: 29 additions & 1 deletion src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,48 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_

#include "tfloat.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n);
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// ------------ FAST FLOAT specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
float DotProductAVX(const float *u, const float *v, int n);
float DotProductAVX1(const float *u, const float *v, int n);
float DotProductAVX2(const float *u, const float *v, int n);
float DotProductAVX3(const float *u, const float *v, int n);
float DotProductAVX4(const float *u, const float *v, int n);

// Use Intel FMA.
float DotProductFMA(const float *u, const float *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
float DotProductSSE(const float *u, const float *v, int n);

float DotProductAccelerate(const float *u, const float *v, int n);

// ------------ HIGH PRECISION DOUBLE specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
double DotProductAVX1(const double *u, const double *v, int n);
double DotProductAVX2(const double *u, const double *v, int n);
double DotProductAVX3(const double *u, const double *v, int n);
double DotProductAVX4(const double *u, const double *v, int n);

// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);

double DotProductAccelerate(const double *u, const double *v, int n);

} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
99 changes: 94 additions & 5 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,95 @@
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__AVX__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for AVX capable architectures
# endif
#else
#include "intsimdmatrix.h"

#if defined(__AVX__)

# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
float DotProductAVX(const float *u, const float *v, int n) {
  const unsigned quot = n / 8;  // number of full 8-float AVX iterations
  const unsigned rem = n % 8;   // leftover elements summed in scalar code
  __m256 t0 = _mm256_setzero_ps();  // 8 running partial sums
  for (unsigned k = 0; k < quot; k++) {
    // Unaligned loads: no alignment contract is imposed on u or v.
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    f0 = _mm256_mul_ps(f0, f1);
    t0 = _mm256_add_ps(t0, f0);
    u += 8;
    v += 8;
  }
  // Horizontal reduction: spill the 8 lanes to the stack and add them
  // in scalar code (simple and correct, if not the fastest reduction).
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  // Scalar tail for the remaining n % 8 elements.
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }
  return result;
}

// Computes and returns the dot product of the n-vectors u and v.
// Variant of DotProductAVX that processes 16 floats per iteration with
// two independent accumulators (t0, t1) to hide FP-add latency.
float DotProductAVX1(const float *u, const float *v, int n) {
  const unsigned quot = n / 16;  // number of full 16-float iterations
  const unsigned rem = n % 16;   // leftover elements summed in scalar code
  __m256 t0 = _mm256_setzero_ps();
  __m256 t1 = _mm256_setzero_ps();
  for (unsigned k = 0; k < quot; k++) {
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    __m256 f2 = _mm256_loadu_ps(u + 8);
    __m256 f3 = _mm256_loadu_ps(v + 8);
    f0 = _mm256_mul_ps(f0, f1);
    f2 = _mm256_mul_ps(f2, f3);
    t0 = _mm256_add_ps(t0, f0);
    t1 = _mm256_add_ps(t1, f2);
    u += 16;
    v += 16;
  }
  // hadd interleaves pairwise sums of t0 and t1 into one register;
  // adding all 8 lanes below therefore yields the total of both
  // accumulators.
  t0 = _mm256_hadd_ps(t0, t1);
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  // Scalar tail for the remaining n % 16 elements.
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }
  return result;
}

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Double-precision variant: 8 doubles per iteration using two 4-wide
// accumulators (t0, t1) to hide FP-add latency.
double DotProductAVX1(const double *u, const double *v, int n) {
  __m256d t0 = _mm256_setzero_pd();
  __m256d t1 = _mm256_setzero_pd();
  // Count the full 8-double iterations down to zero; u and v are
  // advanced in place, so the tail loop below continues where this
  // loop stopped.
  for (unsigned quot = n / 8; quot > 0; quot--) {
    __m256d f0 = _mm256_loadu_pd(u);
    __m256d f1 = _mm256_loadu_pd(v);
    __m256d f2 = _mm256_loadu_pd(u + 4);
    __m256d f3 = _mm256_loadu_pd(v + 4);
    f0 = _mm256_mul_pd(f0, f1);
    f2 = _mm256_mul_pd(f2, f3);
    t0 = _mm256_add_pd(t0, f0);
    t1 = _mm256_add_pd(t1, f2);
    u += 8;
    v += 8;
  }
  // hadd interleaves pairwise sums of t0 and t1; adding the 4 lanes
  // below yields the total of both accumulators.
  t0 = _mm256_hadd_pd(t0, t1);
  alignas(32) double tmp[4];
  _mm256_store_pd(tmp, t0);
  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
  // Scalar tail for the remaining n % 8 elements.
  for (unsigned rem = n % 8; rem > 0; rem--) {
    result += *u++ * *v++;
  }
  return result;
}

double DotProductAVX(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -58,6 +133,20 @@ double DotProductAVX(const double *u, const double *v, int n) {
return result;
}

// ---------------------------- END FLOAT/DOUBLE sections ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
inline TFloat DotProductAVX(const TFloat* u, const TFloat* v, int n) {
return DotProductFMA(u, v, n);
}

}

#endif
54 changes: 49 additions & 5 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,47 @@
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__FMA__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for FMA capable architectures
# endif
#else
#if defined(__FMA__)

# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
// Processes 16 floats per iteration with two independent fused
// multiply-add accumulators (t0, t1) to hide instruction latency.
float DotProductFMA(const float *u, const float *v, int n) {
  const unsigned quot = n / 16;  // number of full 16-float iterations
  const unsigned rem = n % 16;   // leftover elements summed in scalar code
  __m256 t0 = _mm256_setzero_ps();
  __m256 t1 = _mm256_setzero_ps();
  for (unsigned k = 0; k < quot; k++) {
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    // Fused multiply-add: t0 += f0 * f1 with a single rounding.
    t0 = _mm256_fmadd_ps(f0, f1, t0);
    u += 8;
    v += 8;
    __m256 f2 = _mm256_loadu_ps(u);
    __m256 f3 = _mm256_loadu_ps(v);
    t1 = _mm256_fmadd_ps(f2, f3, t1);
    u += 8;
    v += 8;
  }
  // hadd interleaves pairwise sums of t0 and t1; adding all 8 lanes
  // below yields the total of both accumulators.
  t0 = _mm256_hadd_ps(t0, t1);
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  // Scalar tail for the remaining n % 16 elements.
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }
  return result;
}

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -56,6 +83,23 @@ double DotProductFMA(const double *u, const double *v, int n) {
return result;
}

// ---------------------------- END section ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Fallback for builds without FMA support: delegate to the SSE
// implementation.
// NOTE: not 'inline' — dotproduct.h declares these functions non-inline,
// so this translation unit must emit external definitions or callers in
// other TUs fail to link (an inline function defined only here has no
// out-of-line definition).
float DotProductFMA(const float *u, const float *v, int n) {
  return DotProductSSE(u, v, n);
}

double DotProductFMA(const double *u, const double *v, int n) {
  return DotProductSSE(u, v, n);
}

} // namespace tesseract

#endif
Loading