Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use BMI2 instructions to optimize compact protocol int64 code and deco… #2780

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions lib/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,58 @@ if(UNIX)
endif()
endif()

if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you kindly change the indenting to 4 white spaces, instead of two?

# Remember the incoming CMAKE_REQUIRED_FLAGS so they can be restored once the
# feature probes are done.
set(PREV_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
# Add the x86 instruction-set flags to the flags used by the following
# check_cxx_source_compiles() calls.
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -mbmi2 -mbmi -mlzcnt -msse3 -mavx512bw -mavx512vl")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Questions:

  • Not sure if these flags are portable to MSVC? On which platforms did you test?
  • Do you not enforce here already that the CPU supports all these flags? Then, further down, you test which CPU features are supported? Am I missing something?

Problems:

  • Not all users build for their current platforms. I.e. people may want to build on a modern CPU, but may plan to ship their binaries to broader platforms. This is for example the case with Linux distributors. It may be required to make these changes optional, particularly for the very advanced features. I.e. building on an AVX512 capable CPU should not necessarily make the binaries non-portable to non-AVX512-CPUs (which still exist quite a lot).

# Probe every instruction-set extension on its own, passing only the flag(s)
# that extension needs. Running all probes with the full flag list would make
# every probe fail as soon as the compiler rejects a single flag (for example
# an older compiler without -mavx512bw), even though the remaining extensions
# are usable. PREV_CMAKE_REQUIRED_FLAGS holds the flags that were active before
# this section; CMAKE_REQUIRED_FLAGS is reset from it after the probes.
#
# The probe programs initialise their operands and use the intrinsic's result
# so that they are well-formed C++ and cannot be rejected by strict warning
# settings. They are only compiled, never run.

# BMI2: parallel bit deposit.
set(CMAKE_REQUIRED_FLAGS "${PREV_CMAKE_REQUIRED_FLAGS} -mbmi2")
check_cxx_source_compiles(
"
#include <immintrin.h>
int main(){unsigned int a = 0, b = 0; return (int)_pdep_u32(a,b);}
"
HAVE_BMI2)

# BMI: trailing-zero count.
set(CMAKE_REQUIRED_FLAGS "${PREV_CMAKE_REQUIRED_FLAGS} -mbmi")
check_cxx_source_compiles(
"
#include <immintrin.h>
int main(){unsigned int a = 0; return (int)_tzcnt_u32(a);}
"
HAVE_BMI)

# LZCNT: leading-zero count.
set(CMAKE_REQUIRED_FLAGS "${PREV_CMAKE_REQUIRED_FLAGS} -mlzcnt")
check_cxx_source_compiles(
"
#include <immintrin.h>
int main(){unsigned int c = 0; return (int)_lzcnt_u32(c);}
"
HAVE_LZCNT)

# SSE3: unaligned 128-bit load.
set(CMAKE_REQUIRED_FLAGS "${PREV_CMAKE_REQUIRED_FLAGS} -msse3")
check_cxx_source_compiles(
"
#include <immintrin.h>
int main(){__m128i v = _mm_setzero_si128(); v = _mm_lddqu_si128(&v); return _mm_cvtsi128_si32(v);}
"
HAVE_SSE3)

# AVX-512BW + AVX-512VL: masked byte compare on 128-bit vectors needs both.
set(CMAKE_REQUIRED_FLAGS "${PREV_CMAKE_REQUIRED_FLAGS} -mavx512bw -mavx512vl")
check_cxx_source_compiles(
"
#include <immintrin.h>
int main(){__m128i a = _mm_setzero_si128(), b = _mm_setzero_si128(); return (int)_mm_mask_cmp_epi8_mask(0x3ff,a,b,_MM_CMPINT_NLT);}
"
HAVE_AVX512BW_AVX512VL)

# For every probe that succeeded, append the matching compiler switch to
# CMAKE_CXX_FLAGS.
if(HAVE_BMI2)
    string(APPEND CMAKE_CXX_FLAGS " -mbmi2")
endif()

if(HAVE_BMI)
    string(APPEND CMAKE_CXX_FLAGS " -mbmi")
endif()

if(HAVE_LZCNT)
    string(APPEND CMAKE_CXX_FLAGS " -mlzcnt")
endif()

if(HAVE_SSE3)
    string(APPEND CMAKE_CXX_FLAGS " -msse3")
endif()

if(HAVE_AVX512BW_AVX512VL)
    string(APPEND CMAKE_CXX_FLAGS " -mavx512bw -mavx512vl")
endif()

# Put the try-compile flags back to what they were before the probes.
set(CMAKE_REQUIRED_FLAGS ${PREV_CMAKE_REQUIRED_FLAGS})
endif ()

set(thriftcpp_threads_SOURCES
src/thrift/concurrency/ThreadFactory.cpp
src/thrift/concurrency/Thread.cpp
Expand Down
20 changes: 20 additions & 0 deletions lib/cpp/src/thrift/protocol/TCompactProtocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,14 @@ class TCompactProtocolT : public TVirtualProtocol<TCompactProtocolT<Transport_>
// End-of-container / end-of-field hooks. Each one does no work and returns 0
// (presumably the number of bytes written -- confirm against the other
// write* methods).
uint32_t writeListEnd() { return 0; }
uint32_t writeSetEnd() { return 0; }
uint32_t writeFieldEnd() { return 0; }
private:
// Force-inline specifier. `__attribute__((always_inline))` is a GCC/Clang
// extension that MSVC does not accept, so pick the spelling per compiler and
// fall back to plain `inline` elsewhere.
#ifndef THRIFT_COMPACT_FORCE_INLINE
#if defined(_MSC_VER)
#define THRIFT_COMPACT_FORCE_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define THRIFT_COMPACT_FORCE_INLINE inline __attribute__((always_inline))
#else
#define THRIFT_COMPACT_FORCE_INLINE inline
#endif
#endif

// Varint64 writer that is declared on every target (presumably the portable
// fallback -- confirm against the definition). Takes the 64-bit value `n` and
// returns a uint32_t. `needConsume` defaults to true; its effect is defined by
// the implementation, which is not part of this header hunk.
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t writeVarint64NoneBMI2(uint64_t n);

// Variant that is only declared when the compiler targets both BMI2 and LZCNT
// (i.e. __BMI2__ and __LZCNT__ are defined). Same signature as the writer
// above.
#if defined(__BMI2__) && defined(__LZCNT__)
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t writeVarint64BMI2(uint64_t n);
#endif
protected:
int32_t writeFieldBeginInternal(const char* name,
const TType fieldType,
Expand Down Expand Up @@ -223,6 +230,19 @@ class TCompactProtocolT : public TVirtualProtocol<TCompactProtocolT<Transport_>
// End-of-container hooks for reading. Both do no work and return 0
// (presumably the number of bytes consumed -- confirm against the other
// read* methods).
uint32_t readListEnd() { return 0; }
uint32_t readSetEnd() { return 0; }

private:
// Force-inline specifier. `__attribute__((always_inline))` is a GCC/Clang
// extension that MSVC does not accept, so pick the spelling per compiler and
// fall back to plain `inline` elsewhere.
#ifndef THRIFT_COMPACT_FORCE_INLINE
#if defined(_MSC_VER)
#define THRIFT_COMPACT_FORCE_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define THRIFT_COMPACT_FORCE_INLINE inline __attribute__((always_inline))
#else
#define THRIFT_COMPACT_FORCE_INLINE inline
#endif
#endif

// Varint64 readers that are declared on every target (presumably the
// non-SIMD fallbacks -- confirm against the definitions). Each receives a byte
// buffer `buf` of `bufsz` bytes, stores the decoded value in `i64` and returns
// a uint32_t. Note that the fast path takes a const buffer while the slow path
// takes a mutable one. `needConsume` defaults to true; its effect is defined
// by the implementation, which is not part of this header hunk.
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t readVarint64FastPathNoneAVX(const uint8_t* buf,
                                                                 const std::size_t bufsz,
                                                                 int64_t& i64);
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t readVarint64SlowPathNoneAVX(uint8_t* buf,
                                                                 const std::size_t bufsz,
                                                                 int64_t& i64);

// Variants that are only declared when the compiler targets SSE3, AVX-512BW,
// AVX-512VL, BMI2 and BMI together. Signatures mirror the readers above.
#if defined(__SSE3__) && defined(__AVX512BW__) && defined(__AVX512VL__) && \
    defined(__BMI2__) && defined(__BMI__)
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t readVarint64FastPathAVX(const uint8_t* buf,
                                                             const std::size_t bufsz,
                                                             int64_t& i64);
template <bool needConsume = true>
THRIFT_COMPACT_FORCE_INLINE uint32_t readVarint64SlowPathAVX(uint8_t* buf,
                                                             const std::size_t bufsz,
                                                             int64_t& i64);
#endif

protected:
// Varint readers: each stores the decoded value in its output reference
// (32-bit or 64-bit) and returns a uint32_t (presumably the number of bytes
// read -- confirm against the definitions, which are not in this hunk).
uint32_t readVarint32(int32_t& i32);
uint32_t readVarint64(int64_t& i64);
Expand Down
Loading