From b02f23994b1411f82ac1c3fc03168714ac6cd3c4 Mon Sep 17 00:00:00 2001 From: Tyler McDaniel Date: Tue, 14 Jul 2020 15:38:25 -0400 Subject: [PATCH] try non fortran indexing inside kgemm --- kgemm_nn.hpp | 29 ++++++++--------------------- kgemm_nt.hpp | 29 ++++++++--------------------- kroncommon.hpp | 10 ++++++++++ 3 files changed, 26 insertions(+), 42 deletions(-) diff --git a/kgemm_nn.hpp b/kgemm_nn.hpp index e744637..36b403f 100644 --- a/kgemm_nn.hpp +++ b/kgemm_nn.hpp @@ -37,15 +37,15 @@ DEVICE_FUNCTION void kgemm_nn(int const mm, int const nn, int const kk, #endif auto A = [&](int const ia, int const ja) -> T const & { - return (A_[indx2f(ia, ja, ldA)]); + return (A_[indx2(ia, ja, ldA)]); }; auto B = [&](int const ib, int const jb) -> T const & { - return (B_[indx2f(ib, jb, ldB)]); + return (B_[indx2(ib, jb, ldB)]); }; auto C = [&](int const ic, int const jc) -> T & { - return (C_[indx2f(ic, jc, ldC)]); + return (C_[indx2(ic, jc, ldC)]); }; // --------------------------- @@ -54,28 +54,15 @@ DEVICE_FUNCTION void kgemm_nn(int const mm, int const nn, int const kk, for (int ij0 = ij_start; ij0 < (mm * nn); ij0 += ij_size) { - int const i = (ij0 % mm) + 1; - int const j = ((ij0 - (i - 1)) / mm) + 1; + int const i = ij0 % mm; + int const j = (ij0 - i) / mm; T cij = 0; - bool constexpr use_pointer = true; - if (use_pointer) { - - int k = 1; - T const *Ap = &(A(i, k)); - int64_t inc_A = &(A(i, k + 1)) - Ap; - T const *Bp = &(B(k, j)); - int64_t inc_B = &(B(k + 1, j)) - Bp; - for (k = 0; k < kk; k++) { - cij += (*Ap) * (*Bp); - Ap += inc_A; - Bp += inc_B; - }; - } else { - for (int k = 1; k <= kk; k++) { + + for (int k = 0; k < kk; k++) { cij += A(i, k) * B(k, j); }; - }; + // ------------------ // store results to C diff --git a/kgemm_nt.hpp b/kgemm_nt.hpp index 0d8c3e8..d50970c 100644 --- a/kgemm_nt.hpp +++ b/kgemm_nt.hpp @@ -45,15 +45,15 @@ DEVICE_FUNCTION void kgemm_nt(int const mm, int const nn, int const kk, // ------------------------------------ auto A = [&](int const ia, int const ja) -> T const & { - return (A_[indx2f(ia, ja, ldA)]); + return (A_[indx2(ia, ja, ldA)]); }; auto B = [&](int const ib, int const jb) -> T const & { - return (B_[indx2f(ib, jb, ldB)]); + return (B_[indx2(ib, jb, ldB)]); }; auto C = [&](int const ic, int const jc) -> T & { - return (C_[indx2f(ic, jc, ldC)]); + return (C_[indx2(ic, jc, ldC)]); }; // --------------------------- @@ -61,27 +61,14 @@ DEVICE_FUNCTION void kgemm_nt(int const mm, int const nn, int const kk, // --------------------------- for (int ij0 = ij_start; ij0 < (mm * nn); ij0 += ij_size) { - int const i = (ij0 % mm) + 1; - int const j = (ij0 - (i - 1)) / mm + 1; + int const i = ij0 % mm; + int const j = (ij0 - i) / mm; T cij = 0; - bool constexpr use_pointer = true; - if (use_pointer) { - int k = 1; - - T const *Ap = &(A(i, k)); - int64_t const inc_A = &(A(i, k + 1)) - Ap; - T const *Bp = &(B(j, k)); - int64_t const inc_B = &(B(j, k + 1)) - Bp; - for (k = 0; k < kk; k++) { - cij += (*Ap) * (*Bp); - Ap += inc_A; - Bp += inc_B; - }; - } else { - for (int k = 1; k <= kk; k++) { + + for (int k = 0; k < kk; k++) { cij += A(i, k) * B(j, k); }; - }; + // ------------------ // store results to C diff --git a/kroncommon.hpp b/kroncommon.hpp index 6612b2b..97647e9 100644 --- a/kroncommon.hpp +++ b/kroncommon.hpp @@ -55,6 +55,16 @@ float atomicAdd( float volatile *p, float dvalue) #endif +static inline +HOST_FUNCTION DEVICE_FUNCTION +int indx2( int const i, + int const j, + int const ld ) +{ + return( i + j*ld ); +} + + static inline HOST_FUNCTION DEVICE_FUNCTION int indx2f( int const i,