Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improved #3494: AVX2 bugfixes + no code duplication for the integer workhorses in there #3495

Closed
wants to merge 11 commits into from
17 changes: 17 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -147,38 +147,44 @@ libtesseract_native_la_CXXFLAGS = -O3 -ffast-math
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
endif
libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp

if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
endif

if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif

if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
endif

if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
Expand Down Expand Up @@ -1230,6 +1236,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += dawg_test
endif # ENABLE_TRAINING
check_PROGRAMS += denorm_test
check_PROGRAMS += dotproduct_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += equationdetect_test
endif # !DISABLED_LEGACY_ENGINE
Expand Down Expand Up @@ -1356,6 +1363,16 @@ denorm_test_SOURCES = unittest/denorm_test.cc
denorm_test_CPPFLAGS = $(unittest_CPPFLAGS)
denorm_test_LDADD = $(TESS_LIBS)

dotproduct_test_SOURCES = unittest/dotproduct_test.cc
dotproduct_test_CPPFLAGS = $(unittest_CPPFLAGS)
if HAVE_AVX2
dotproduct_test_CPPFLAGS += -DHAVE_AVX2
endif
if HAVE_SSE4_1
dotproduct_test_CPPFLAGS += -DHAVE_SSE4_1
endif
dotproduct_test_LDADD = $(TESS_LIBS)

if !DISABLED_LEGACY_ENGINE
equationdetect_test_SOURCES = unittest/equationdetect_test.cc
equationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)
Expand Down
13 changes: 7 additions & 6 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ m4_define([MY_CHECK_FRAMEWORK],
])
if test "$my_cv_framework_$1"="yes"; then
AC_DEFINE(AS_TR_CPP([HAVE_FRAMEWORK_$1]), 1,
[Define if you have the $1 framework])
[Define if you have the $1 framework])
AS_TR_CPP([FRAMEWORK_$1])="-framework $1"
AC_SUBST(AS_TR_CPP([FRAMEWORK_$1]))
fi]
Expand All @@ -295,13 +295,14 @@ OPENCL_CPPFLAGS=''
OPENCL_LDFLAGS=''
case "${host_os}" in
*darwin* | *-macos10*)
echo "checking for OpenCL framework"
MY_CHECK_FRAMEWORK([OpenCL])
if test $my_cv_framework_OpenCL = yes; then
have_opencl_lib=true
MY_CHECK_FRAMEWORK([Accelerate])
if test $my_cv_framework_Accelerate = yes; then
AM_CPPFLAGS="-DHAVE_FRAMEWORK_ACCELERATE $AM_CPPFLAGS"
LDFLAGS="$LDFLAGS -framework Accelerate"
fi
MY_CHECK_FRAMEWORK([OpenCL])
if test "$enable_opencl" = "yes"; then
if !($have_opencl_lib); then
if test $my_cv_framework_OpenCL = no; then
AC_MSG_ERROR([Required OpenCL library not found!])
fi
AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS"
Expand Down
7 changes: 4 additions & 3 deletions src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n) {
double total = 0.0;
for (int k = 0; k < n; ++k) {
// Computes and returns the dot product of the two n-vectors u and v.
// Portable C++ fallback used when no SIMD implementation is selected.
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
  TFloat total = 0;
  // Fix: the directive was misspelled "omp simdi"; compilers silently
  // ignore unknown pragmas, so the SIMD-vectorization hint was lost.
  // "omp simd" with a sum reduction lets the compiler vectorize safely.
#pragma omp simd reduction(+:total)
  for (int k = 0; k < n; k++) {
    total += u[k] * v[k];
  }
  return total;
}
Expand Down
15 changes: 11 additions & 4 deletions src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,27 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_

#include "tfloat.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n);
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);

// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);

TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
74 changes: 74 additions & 0 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,79 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) variant: one 256-bit accumulator, 8 floats per
// iteration. Uses unaligned loads, so u and v need no special alignment.
float DotProductAVX(const float *u, const float *v, int n) {
// Main loop covers quot * 8 elements; the remaining rem (< 8) elements
// are handled by the scalar tail loop below.
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m256 t0 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
f0 = _mm256_mul_ps(f0, f1);
t0 = _mm256_add_ps(t0, f0);
u += 8;
v += 8;
}
// Spill the 8 lane-wise partial sums to a 32-byte-aligned buffer
// (required by _mm256_store_ps) and reduce them to a scalar.
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
// Scalar tail for the leftover elements.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
// Unrolled float variant: two independent 256-bit accumulators process
// 16 floats per iteration, hiding the latency of the add chain.
float DotProductAVX1(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
__m256 t1 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
// Two non-overlapping 8-float loads per input vector (offset 0 and 8).
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
__m256 f2 = _mm256_loadu_ps(u + 8);
__m256 f3 = _mm256_loadu_ps(v + 8);
f0 = _mm256_mul_ps(f0, f1);
f2 = _mm256_mul_ps(f2, f3);
t0 = _mm256_add_ps(t0, f0);
t1 = _mm256_add_ps(t1, f2);
u += 16;
v += 16;
}
// Horizontal add interleaves pairwise sums of t0 and t1; summing all
// 8 output lanes below yields the total of all 16 partial sums.
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
// Scalar tail for the n % 16 leftover elements.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else
// Unrolled double variant: two 256-bit accumulators (4 doubles each)
// process 8 doubles per iteration.
double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
// Count the main loop down from n / 8; pointers advance 8 per pass.
for (unsigned quot = n / 8; quot > 0; quot--) {
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
__m256d f2 = _mm256_loadu_pd(u + 4);
__m256d f3 = _mm256_loadu_pd(v + 4);
f0 = _mm256_mul_pd(f0, f1);
f2 = _mm256_mul_pd(f2, f3);
t0 = _mm256_add_pd(t0, f0);
t1 = _mm256_add_pd(t1, f2);
u += 8;
v += 8;
}
// hadd interleaves pairwise sums of t0 and t1; summing the 4 output
// lanes below yields the total of all 8 partial sums.
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
// Scalar tail for the n % 8 leftover elements.
for (unsigned rem = n % 8; rem > 0; rem--) {
result += *u++ * *v++;
}
return result;
}

double DotProductAVX(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -57,6 +130,7 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
29 changes: 29 additions & 0 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,34 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) variant: two 256-bit FMA accumulators, 16 floats
// per iteration; unaligned loads, scalar tail for the remainder.
//
// Bug fixes versus the previous revision:
//  1. quot was n / 8 while each iteration issued two overlapping
//     8-float loads advanced by only 4 elements, double-counting
//     elements 4..7 of every group and over-reading the inputs.
//     A 256-bit ps load covers 8 floats, so the pair of loads must
//     consume 16 elements per iteration (quot = n / 16, u/v += 8 twice).
//  2. The spill buffer was float tmp[4], but _mm256_store_ps writes
//     8 floats — a stack buffer overflow — and only 4 lanes were
//     summed, dropping half of the partial sums.
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
  const unsigned quot = n / 16;
  const unsigned rem = n % 16;
  __m256 t0 = _mm256_setzero_ps();
  __m256 t1 = _mm256_setzero_ps();
  for (unsigned k = 0; k < quot; k++) {
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    t0 = _mm256_fmadd_ps(f0, f1, t0);
    u += 8;
    v += 8;
    __m256 f2 = _mm256_loadu_ps(u);
    __m256 f3 = _mm256_loadu_ps(v);
    t1 = _mm256_fmadd_ps(f2, f3, t1);
    u += 8;
    v += 8;
  }
  // hadd interleaves pairwise sums of t0 and t1; summing all 8 output
  // lanes yields the total of all 16 partial sums.
  t0 = _mm256_hadd_ps(t0, t1);
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  // Scalar tail for the n % 16 leftover elements.
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }
  return result;
}
#else
double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -55,6 +83,7 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
10 changes: 10 additions & 0 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
// float (FAST_FLOAT) placeholder: a plain scalar dot product — no SSE
// intrinsics yet for this type; the compiler is left to vectorize it.
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
  TFloat sum = 0.0;
  const TFloat *u_end = u + n;
  while (u != u_end) {
    sum += *u++ * *v++;
  }
  return sum;
}
#else
double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
Expand Down Expand Up @@ -78,6 +87,7 @@ double DotProductSSE(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

Expand Down
2 changes: 1 addition & 1 deletion src/arch/intsimdmatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t>
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
const std::vector<double> &scales, const int8_t *u, double *v) {
const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// Base implementation.
Expand Down
16 changes: 9 additions & 7 deletions src/arch/intsimdmatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <cstdint>
#include <vector>

#include "tfloat.h"

namespace tesseract {

template <class T>
Expand Down Expand Up @@ -78,8 +80,8 @@ struct TESS_API IntSimdMatrix {
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Computes the base C++ implementation.
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
const int8_t *u, double *v);
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales,
const int8_t *u, TFloat *v);

// Rounds the input up to a multiple of the given factor.
static int Roundup(int input, int factor) {
Expand All @@ -95,8 +97,8 @@ struct TESS_API IntSimdMatrix {
// RoundInputs above.
// The input will be over-read to the extent of the padding. There are no
// alignment requirements.
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
double *);
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *,
TFloat *);
MatrixDotVectorFunction matrixDotVectorFunction;

// Number of 32 bit outputs held in each register.
Expand All @@ -112,10 +114,10 @@ struct TESS_API IntSimdMatrix {

static const IntSimdMatrix *intSimdMatrix;
// Only available with NEON.
static const IntSimdMatrix intSimdMatrixNEON;
static const IntSimdMatrix *intSimdMatrixNEON;
// Only available with AVX2 / SSE.
static const IntSimdMatrix intSimdMatrixAVX2;
static const IntSimdMatrix intSimdMatrixSSE;
static const IntSimdMatrix *intSimdMatrixAVX2;
static const IntSimdMatrix *intSimdMatrixSSE;
};

} // namespace tesseract
Expand Down
Loading