Skip to content

Commit

Permalink
This is what the idea expressed in tesseract-ocr#3490 looks like: usin…
Browse files Browse the repository at this point in the history
…g function templates for TFloat float & double implementations to co-exist in the run-time without cluttering the code with #if/#else and no run-time switches (yet).

## Observations thus far

- DRY? Check!
- the whole "function template (and let the C++ compiler do the heavy lifting)" idea stops somewhere. This regrettably happens to be at the weightmatrix.cpp code, where the code calls the CPU+configuration-selected SIMD implementation via function pointer: `intSimdMatrix->matrixDotVectorFunction` -- this would require code duplication of some kind (e.g. a FP32 callback pointer co-existing with a FP64 callback ptr in the struct and then have the code pick the right one, depending on current TFloat size, for example) and is thus deemed unsatisfactory (my opinion).
- So far, and very probably independent of any solutions for the co-existence issue at higher levels in the code, this template approach works out well, with the compiler smartly picking the one matching the current float/double choice.
- while we have double the number of specialized SIMD implementations (obviously), these do not need #if/#else checks as we can let the C++ compiler do its prototype matching job --> cleaner code.
- the template functions also help clean up the serialization/de-serialization code as the `<T, ST>` dual-type approach there allows one to specify the run-time type (TFloat) and the file-storage type at the same time: also do note how this cleans up the 'Old' scales deserialization code, as the old file storage is simply 'float' instead of 'double'.
- the added cost there is a double copy of file data when T==ST, but that turned out negligible in the preliminary tests as that bit of code didn't even reach the Top20 CPU Guzzlers Chart, so that extra copy can wait for smarter C++ template writers to take care of when microtuning is called for.
  • Loading branch information
GerHobbelt committed Jul 15, 2021
1 parent 8d1c1e1 commit 97834d0
Show file tree
Hide file tree
Showing 9 changed files with 172 additions and 295 deletions.
5 changes: 5 additions & 0 deletions src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#if defined(OPENMP_SIMD) || defined(_OPENMP)
Expand All @@ -30,4 +31,8 @@ TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
return total;
}

// Explicit instantiations for the two supported precisions.  The template
// definition lives in this .cpp file (not the header), so the float and
// double versions must be emitted here for other translation units to link.
template float DotProductNative<float>(const float *u, const float *v, int n);
template double DotProductNative<double>(const double *u, const double *v, int n);

} // namespace tesseract
37 changes: 29 additions & 8 deletions src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,43 @@
namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// ------------ FAST FLOAT specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
float DotProductAVX(const float *u, const float *v, int n);
float DotProductAVX1(const float *u, const float *v, int n);
float DotProductAVX2(const float *u, const float *v, int n);
float DotProductAVX3(const float *u, const float *v, int n);
float DotProductAVX4(const float *u, const float *v, int n);

// Use Intel FMA.
float DotProductFMA(const float *u, const float *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
float DotProductSSE(const float *u, const float *v, int n);

float DotProductAccelerate(const float *u, const float *v, int n);

// ------------ HIGH PRECISION DOUBLE specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);
double DotProductAVX(const double *u, const double *v, int n);
double DotProductAVX1(const double *u, const double *v, int n);
double DotProductAVX2(const double *u, const double *v, int n);
double DotProductAVX3(const double *u, const double *v, int n);
double DotProductAVX4(const double *u, const double *v, int n);

// Use Intel FMA.
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
double DotProductFMA(const double *u, const double *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
double DotProductSSE(const double *u, const double *v, int n);

double DotProductAccelerate(const double *u, const double *v, int n);

TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
31 changes: 23 additions & 8 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,20 @@
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__AVX__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for AVX capable architectures
# endif
#else
#include "intsimdmatrix.h"

#if defined(__AVX__)

# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductAVX(const float *u, const float *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -50,6 +49,7 @@ float DotProductAVX(const float *u, const float *v, int n) {
}
return result;
}

float DotProductAVX1(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
Expand All @@ -76,7 +76,9 @@ float DotProductAVX1(const float *u, const float *v, int n) {
}
return result;
}
#else

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
Expand Down Expand Up @@ -130,8 +132,21 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END FLOAT/DOUBLE sections ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
inline TFloat DotProductAVX(const TFloat* u, const TFloat* v, int n) {
return DotProductFMA(u, v, n);
}

}

#endif
37 changes: 26 additions & 11 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,19 @@
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__FMA__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for FMA capable architectures
# endif
#else
#if defined(__FMA__)

# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
float DotProductFMA(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
Expand All @@ -48,15 +45,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
v += 8;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) TFloat tmp[8];
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -83,8 +82,24 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Fallback for builds without FMA support: forward both overloads to
// the SSE implementation declared in dotproduct.h.  (No FMA intrinsics
// are used here -- this branch is only compiled when __FMA__ is absent.)
// NOTE: deliberately not 'inline' -- these are the sole out-of-line
// definitions for externally declared functions and therefore need
// external linkage to satisfy callers in other translation units.
float DotProductFMA(const float *u, const float *v, int n) {
  return DotProductSSE(u, v, n);
}

double DotProductFMA(const double *u, const double *v, int n) {
  return DotProductSSE(u, v, n);
}

} // namespace tesseract

#endif
28 changes: 20 additions & 8 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__SSE4_1__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for SSE 4.1 capable architectures
# endif
#else
#if defined(__SSE4_1__)

# include <emmintrin.h>
# include <smmintrin.h>
Expand All @@ -28,9 +24,10 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductSSE(const float *u, const float *v, int n) {
int max_offset = n - 4;
int offset = 0;
Expand Down Expand Up @@ -89,7 +86,9 @@ float DotProductSSE(const float *u, const float *v, int n) {
}
return result;
}
#else

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
Expand Down Expand Up @@ -139,8 +138,21 @@ double DotProductSSE(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
inline TFloat DotProductSSE(const TFloat* u, const TFloat* v, int n) {
return DotProductNative(u, v, n);
}

}

#endif
Loading

0 comments on commit 97834d0

Please sign in to comment.