From 68cf2dd23a0d64bfb59ffc418ed53a856f7c965a Mon Sep 17 00:00:00 2001 From: Andrew Au Date: Tue, 20 Feb 2024 11:44:43 -0800 Subject: [PATCH] Getting vxsort working on Linux amd64 Co-authored-by: Jan Vorlicek Co-authored-by: Adeel Mujahid <3840695+am11@users.noreply.github.com> --- .../dlls/mscoree/coreclr/CMakeLists.txt | 6 +++ src/coreclr/gc/CMakeLists.txt | 26 +++++----- src/coreclr/gc/gc.cpp | 6 +-- src/coreclr/gc/gcsvr.cpp | 2 +- src/coreclr/gc/gcwks.cpp | 2 +- src/coreclr/gc/unix/gcenv.unix.cpp | 6 --- src/coreclr/gc/vxsort/CMakeLists.txt | 29 +++++++++++ src/coreclr/gc/vxsort/defs.h | 31 +----------- src/coreclr/gc/vxsort/machine_traits.avx2.h | 4 +- src/coreclr/gc/vxsort/machine_traits.avx512.h | 4 +- src/coreclr/gc/vxsort/packer.h | 12 ++--- .../bitonic_sort.AVX2.int32_t.generated.h | 2 +- .../bitonic_sort.AVX2.int64_t.generated.h | 2 +- .../bitonic_sort.AVX512.int32_t.generated.h | 2 +- .../bitonic_sort.AVX512.int64_t.generated.h | 2 +- .../gc/vxsort/smallsort/codegen/avx2.py | 2 +- .../gc/vxsort/smallsort/codegen/avx512.py | 2 +- src/coreclr/gc/vxsort/vxsort.h | 49 ++++++++++++++----- src/coreclr/inc/palclr_win.h | 4 -- .../Microsoft.NETCore.Native.Unix.targets | 3 ++ src/coreclr/nativeaot/Runtime/CMakeLists.txt | 4 +- .../nativeaot/Runtime/Full/CMakeLists.txt | 16 +++--- src/coreclr/pal/inc/pal.h | 8 --- src/coreclr/pal/inc/rt/specstrings_strict.h | 1 - src/coreclr/pal/inc/rt/specstrings_undef.h | 1 - src/coreclr/pal/src/include/pal/palinternal.h | 6 --- src/coreclr/vm/CMakeLists.txt | 15 ------ .../Directory.Build.props | 2 + src/native/libs/Common/pal_io_common.h | 1 - src/native/libs/Common/pal_utilities.h | 12 +---- src/native/minipal/utils.h | 19 +++++++ .../SmokeTests/HardwareIntrinsics/Program.cs | 2 +- 32 files changed, 142 insertions(+), 141 deletions(-) create mode 100644 src/coreclr/gc/vxsort/CMakeLists.txt diff --git a/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt b/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt index 2e2a8bf87eccd..c600af1fb6aad 100644 --- a/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt +++ b/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt @@ -111,6 +111,12 @@ set(CORECLR_LIBRARIES gc_pal ) +if(CLR_CMAKE_TARGET_ARCH_AMD64) + list(APPEND CORECLR_LIBRARIES + gc_vxsort + ) +endif(CLR_CMAKE_TARGET_ARCH_AMD64) + if(CLR_CMAKE_TARGET_WIN32) list(APPEND CORECLR_LIBRARIES ${STATIC_MT_CRT_LIB} diff --git a/src/coreclr/gc/CMakeLists.txt b/src/coreclr/gc/CMakeLists.txt index a1509b9898b62..89937554c0417 100644 --- a/src/coreclr/gc/CMakeLists.txt +++ b/src/coreclr/gc/CMakeLists.txt @@ -36,20 +36,9 @@ else() windows/Native.rc) endif(CLR_CMAKE_HOST_UNIX) -if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) - set (GC_SOURCES - ${GC_SOURCES} - vxsort/isa_detection.cpp - vxsort/do_vxsort_avx2.cpp - vxsort/do_vxsort_avx512.cpp - vxsort/machine_traits.avx2.cpp - vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp - vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp - vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp - vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp - vxsort/smallsort/avx2_load_mask_tables.cpp -) -endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) +if (CLR_CMAKE_TARGET_ARCH_AMD64) + add_subdirectory(vxsort) +endif (CLR_CMAKE_TARGET_ARCH_AMD64) if (CLR_CMAKE_TARGET_WIN32) set(GC_HEADERS @@ -87,7 +76,7 @@ if (CLR_CMAKE_TARGET_WIN32) handletablepriv.h objecthandle.h softwarewritewatch.h - vxsort/do_vxsort.h) + ) endif(CLR_CMAKE_TARGET_WIN32) if(CLR_CMAKE_HOST_WIN32) @@ -100,6 +89,13 @@ endif(CLR_CMAKE_HOST_WIN32) set (GC_LINK_LIBRARIES ${GC_LINK_LIBRARIES} gc_pal) +if(CLR_CMAKE_TARGET_ARCH_AMD64) + list(APPEND GC_LINK_LIBRARIES + gc_vxsort + ) +endif(CLR_CMAKE_TARGET_ARCH_AMD64) + + list(APPEND GC_SOURCES ${GC_HEADERS}) convert_to_absolute_path(GC_SOURCES ${GC_SOURCES}) diff --git a/src/coreclr/gc/gc.cpp b/src/coreclr/gc/gc.cpp index 40cb8694fd4d5..902469979eddd 100644 --- a/src/coreclr/gc/gc.cpp +++ b/src/coreclr/gc/gc.cpp @@ -18,7 +18,7 @@ #include "gcpriv.h" -#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) +#ifdef TARGET_AMD64 #define USE_VXSORT #else #define USE_INTROSORT @@ -10305,11 +10305,11 @@ static void do_vxsort (uint8_t** item_array, ptrdiff_t item_count, uint8_t* rang { // above this threshold, using AVX2 for sorting will likely pay off // despite possible downclocking on some devices - const size_t AVX2_THRESHOLD_SIZE = 8 * 1024; + const ptrdiff_t AVX2_THRESHOLD_SIZE = 8 * 1024; // above this threshold, using AVX512F for sorting will likely pay off // despite possible downclocking on current devices - const size_t AVX512F_THRESHOLD_SIZE = 128 * 1024; + const ptrdiff_t AVX512F_THRESHOLD_SIZE = 128 * 1024; if (item_count <= 1) return; diff --git a/src/coreclr/gc/gcsvr.cpp b/src/coreclr/gc/gcsvr.cpp index 9e4a784735302..5dc848f40c3f7 100644 --- a/src/coreclr/gc/gcsvr.cpp +++ b/src/coreclr/gc/gcsvr.cpp @@ -20,7 +20,7 @@ #define SERVER_GC 1 -#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) +#ifdef TARGET_AMD64 #include "vxsort/do_vxsort.h" #endif diff --git a/src/coreclr/gc/gcwks.cpp b/src/coreclr/gc/gcwks.cpp index 7d599e8d8e51f..6b4cfe1681463 100644 --- a/src/coreclr/gc/gcwks.cpp +++ b/src/coreclr/gc/gcwks.cpp @@ -20,7 +20,7 @@ #undef SERVER_GC #endif -#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) +#ifdef TARGET_AMD64 #include "vxsort/do_vxsort.h" #endif diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index 6f1a254a0528c..21a7494cfbced 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -35,12 +35,6 @@ #define __has_cpp_attribute(x) (0) #endif -#if __has_cpp_attribute(fallthrough) -#define FALLTHROUGH [[fallthrough]] -#else -#define FALLTHROUGH -#endif - #include #if HAVE_SYS_TIME_H diff --git a/src/coreclr/gc/vxsort/CMakeLists.txt b/src/coreclr/gc/vxsort/CMakeLists.txt new file mode 100644 index 0000000000000..fc55956832e3d --- /dev/null +++ b/src/coreclr/gc/vxsort/CMakeLists.txt @@ -0,0 +1,29 @@ +set(CMAKE_INCLUDE_CURRENT_DIR ON) +include_directories("../env") + +if(CLR_CMAKE_HOST_UNIX) + set_source_files_properties(isa_detection.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(do_vxsort_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(do_vxsort_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(machine_traits.avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(smallsort/bitonic_sort.AVX2.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(smallsort/bitonic_sort.AVX2.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(smallsort/bitonic_sort.AVX512.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(smallsort/bitonic_sort.AVX512.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(smallsort/avx2_load_mask_tables.cpp PROPERTIES COMPILE_FLAGS -mavx2) +endif(CLR_CMAKE_HOST_UNIX) + +set (VXSORT_SOURCES + isa_detection.cpp + do_vxsort_avx2.cpp + do_vxsort_avx512.cpp + machine_traits.avx2.cpp + smallsort/bitonic_sort.AVX2.int64_t.generated.cpp + smallsort/bitonic_sort.AVX2.int32_t.generated.cpp + smallsort/bitonic_sort.AVX512.int64_t.generated.cpp + smallsort/bitonic_sort.AVX512.int32_t.generated.cpp + smallsort/avx2_load_mask_tables.cpp + do_vxsort.h +) + +add_library(gc_vxsort STATIC ${VXSORT_SOURCES}) diff --git a/src/coreclr/gc/vxsort/defs.h b/src/coreclr/gc/vxsort/defs.h index 0cc72b23fa24e..d048185884770 100644 --- a/src/coreclr/gc/vxsort/defs.h +++ b/src/coreclr/gc/vxsort/defs.h @@ -45,36 +45,7 @@ #define NOINLINE __attribute__((noinline)) #endif -namespace std { -template -class numeric_limits { - public: - static constexpr _Ty Max() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); } - static constexpr _Ty Min() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); } -}; - -template <> -class numeric_limits { -public: - static constexpr int32_t Max() { return 0x7fffffff; } - static constexpr int32_t Min() { return -0x7fffffff - 1; } -}; - -template <> -class numeric_limits { -public: - static constexpr uint32_t Max() { return 0xffffffff; } - static constexpr uint32_t Min() { return 0; } -}; - -template <> -class numeric_limits { - public: - static constexpr int64_t Max() { return 0x7fffffffffffffffi64; } - - static constexpr int64_t Min() { return -0x7fffffffffffffffi64 - 1; } -}; -} // namespace std +#include #ifndef max template diff --git a/src/coreclr/gc/vxsort/machine_traits.avx2.h b/src/coreclr/gc/vxsort/machine_traits.avx2.h index ccadc2a9a27a5..7aca281e288ea 100644 --- a/src/coreclr/gc/vxsort/machine_traits.avx2.h +++ b/src/coreclr/gc/vxsort/machine_traits.avx2.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "defs.h" #include "machine_traits.h" @@ -123,8 +124,7 @@ class vxsort_machine_traits { template static constexpr bool can_pack(T span) { - const auto PACK_LIMIT = (((TU) std::numeric_limits::Max() + 1)) << Shift; - return ((TU) span) < PACK_LIMIT; + return ((TU) span) < ((((TU) std::numeric_limits::max() + 1)) << Shift); } static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); } diff --git a/src/coreclr/gc/vxsort/machine_traits.avx512.h b/src/coreclr/gc/vxsort/machine_traits.avx512.h index 8df8660aa13a7..78f59dee99a36 100644 --- a/src/coreclr/gc/vxsort/machine_traits.avx512.h +++ b/src/coreclr/gc/vxsort/machine_traits.avx512.h @@ -11,6 +11,7 @@ #include "vxsort_targets_enable_avx512.h" #include +#include #include "defs.h" #include "machine_traits.h" @@ -92,8 +93,7 @@ class vxsort_machine_traits { template static constexpr bool can_pack(T span) { - const auto PACK_LIMIT = (((TU) std::numeric_limits::Max() + 1)) << Shift; - return ((TU) span) < PACK_LIMIT; + return ((TU) span) < ((((TU) std::numeric_limits::max() + 1)) << Shift); } static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); } diff --git a/src/coreclr/gc/vxsort/packer.h b/src/coreclr/gc/vxsort/packer.h index be50b7d5fb41b..94f293dac71f7 100644 --- a/src/coreclr/gc/vxsort/packer.h +++ b/src/coreclr/gc/vxsort/packer.h @@ -56,7 +56,7 @@ class packer { public: static void pack(TFrom *mem, size_t len, TFrom base) { - TFrom offset = MT::template shift_n_sub(base, (TFrom) std::numeric_limits::Min()); + TFrom offset = MT::template shift_n_sub(base, (TFrom) std::numeric_limits::min()); auto baseVec = MT::broadcast(offset); auto pre_aligned_mem = reinterpret_cast(reinterpret_cast(mem) & ~ALIGN_MASK); @@ -87,8 +87,8 @@ class packer { assert(AH::is_aligned(mem_read)); - auto memv_read = (TV *) mem_read; - auto memv_write = (TV *) mem_write; + TV * memv_read = (TV *) mem_read; + TV * memv_write = (TV *) mem_write; auto lenv = len / N; len -= (lenv * N); @@ -156,7 +156,7 @@ class packer { static void unpack(TTo *mem, size_t len, TFrom base) { - TFrom offset = MT::template shift_n_sub(base, (TFrom) std::numeric_limits::Min()); + TFrom offset = MT::template shift_n_sub(base, (TFrom) std::numeric_limits::min()); auto baseVec = MT::broadcast(offset); auto mem_read = mem + len; @@ -184,8 +184,8 @@ class packer { assert(AH::is_aligned(mem_read)); auto lenv = len / (N * 2); - auto memv_read = ((TV *) mem_read) - 1; - auto memv_write = ((TV *) mem_write) - 2; + TV * memv_read = ((TV *) mem_read) - 1; + TV * memv_write = ((TV *) mem_write) - 2; len -= lenv * N * 2; while (lenv >= Unroll) { diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h index c3f141c1046bb..c805a425fbeae 100644 --- a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h +++ b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE]; template<> struct bitonic { static const int N = 8; - static constexpr int32_t MAX = std::numeric_limits::Max(); + static constexpr int32_t MAX = std::numeric_limits::max(); public: static INLINE void sort_01v_ascending(__m256i& d01) { diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h index a012161c99dd9..c3403bbe31aaa 100644 --- a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h +++ b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE]; template<> struct bitonic { static const int N = 4; - static constexpr int64_t MAX = std::numeric_limits::Max(); + static constexpr int64_t MAX = std::numeric_limits::max(); public: static INLINE void sort_01v_ascending(__m256i& d01) { diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h index 1326c8fee5e5c..eb9ee4d275926 100644 --- a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h +++ b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h @@ -36,7 +36,7 @@ namespace vxsort { namespace smallsort { template<> struct bitonic { static const int N = 16; - static constexpr int32_t MAX = std::numeric_limits::Max(); + static constexpr int32_t MAX = std::numeric_limits::max(); public: static INLINE void sort_01v_ascending(__m512i& d01) { diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h index ac44992fe2392..98fe507b73430 100644 --- a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h +++ b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h @@ -36,7 +36,7 @@ namespace vxsort { namespace smallsort { template<> struct bitonic { static const int N = 8; - static constexpr int64_t MAX = std::numeric_limits::Max(); + static constexpr int64_t MAX = std::numeric_limits::max(); public: static INLINE void sort_01v_ascending(__m512i& d01) { diff --git a/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py b/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py index 9944cbbc8968e..b9c39770d549c 100644 --- a/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py +++ b/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py @@ -303,7 +303,7 @@ def generate_prologue(self, f): template<> struct bitonic<{t}, AVX2> {{ static const int N = {self.vector_size()}; - static constexpr {t} MAX = std::numeric_limits<{t}>::Max(); + static constexpr {t} MAX = std::numeric_limits<{t}>::max(); public: """ print(s, file=f) diff --git a/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py b/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py index e259027c5636b..9b417723c6e3b 100644 --- a/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py +++ b/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py @@ -299,7 +299,7 @@ def generate_prologue(self, f): namespace smallsort {{ template<> struct bitonic<{t}, AVX512> {{ static const int N = {self.vector_size()}; - static constexpr {t} MAX = std::numeric_limits<{t}>::Max(); + static constexpr {t} MAX = std::numeric_limits<{t}>::max(); public: """ print(s, file=f) diff --git a/src/coreclr/gc/vxsort/vxsort.h b/src/coreclr/gc/vxsort/vxsort.h index b8eaac51f4213..ace20c10734fd 100644 --- a/src/coreclr/gc/vxsort/vxsort.h +++ b/src/coreclr/gc/vxsort/vxsort.h @@ -13,10 +13,11 @@ #endif #endif - #include #include +#include + #include "defs.h" #include "alignment.h" #include "machine_traits.h" @@ -374,7 +375,7 @@ class vxsort { auto pivot = *right; // We do this here just in case we need to pre-align to the right // We end up - *right = std::numeric_limits::Max(); + *right = std::numeric_limits::max(); // Broadcast the selected pivot const TV P = MT::broadcast(pivot); @@ -421,16 +422,16 @@ class vxsort { // From now on, we are fully aligned // and all reading is done in full vector units - auto readLeftV = (TV*) readLeft; - auto readRightV = (TV*) readRight; + TV* readLeftV = (TV*) readLeft; + TV* readRightV = (TV*) readRight; #ifndef NDEBUG readLeft = nullptr; readRight = nullptr; #endif for (auto u = 0; u < InnerUnroll; u++) { - auto dl = MT::load_vec(readLeftV + u); - auto dr = MT::load_vec(readRightV - (u + 1)); + TV dl = MT::load_vec(readLeftV + u); + TV dr = MT::load_vec(readRightV - (u + 1)); partition_block(dl, P, tmpLeft, tmpRight); partition_block(dr, P, tmpLeft, tmpRight); } @@ -458,31 +459,53 @@ class vxsort { switch (InnerUnroll) { case 12: d12 = MT::load_vec(nextPtr + InnerUnroll - 12); + FALLTHROUGH; case 11: d11 = MT::load_vec(nextPtr + InnerUnroll - 11); + FALLTHROUGH; case 10: d10 = MT::load_vec(nextPtr + InnerUnroll - 10); + FALLTHROUGH; case 9: d09 = MT::load_vec(nextPtr + InnerUnroll - 9); + FALLTHROUGH; case 8: d08 = MT::load_vec(nextPtr + InnerUnroll - 8); + FALLTHROUGH; case 7: d07 = MT::load_vec(nextPtr + InnerUnroll - 7); + FALLTHROUGH; case 6: d06 = MT::load_vec(nextPtr + InnerUnroll - 6); + FALLTHROUGH; case 5: d05 = MT::load_vec(nextPtr + InnerUnroll - 5); + FALLTHROUGH; case 4: d04 = MT::load_vec(nextPtr + InnerUnroll - 4); + FALLTHROUGH; case 3: d03 = MT::load_vec(nextPtr + InnerUnroll - 3); + FALLTHROUGH; case 2: d02 = MT::load_vec(nextPtr + InnerUnroll - 2); + FALLTHROUGH; case 1: d01 = MT::load_vec(nextPtr + InnerUnroll - 1); } switch (InnerUnroll) { case 12: partition_block(d12, P, writeLeft, writeRight); + FALLTHROUGH; case 11: partition_block(d11, P, writeLeft, writeRight); + FALLTHROUGH; case 10: partition_block(d10, P, writeLeft, writeRight); + FALLTHROUGH; case 9: partition_block(d09, P, writeLeft, writeRight); + FALLTHROUGH; case 8: partition_block(d08, P, writeLeft, writeRight); + FALLTHROUGH; case 7: partition_block(d07, P, writeLeft, writeRight); + FALLTHROUGH; case 6: partition_block(d06, P, writeLeft, writeRight); + FALLTHROUGH; case 5: partition_block(d05, P, writeLeft, writeRight); + FALLTHROUGH; case 4: partition_block(d04, P, writeLeft, writeRight); + FALLTHROUGH; case 3: partition_block(d03, P, writeLeft, writeRight); + FALLTHROUGH; case 2: partition_block(d02, P, writeLeft, writeRight); + FALLTHROUGH; case 1: partition_block(d01, P, writeLeft, writeRight); } } @@ -499,7 +522,7 @@ class vxsort { readLeftV += 1; } - auto d = MT::load_vec(nextPtr); + TV d = MT::load_vec(nextPtr); partition_block(d, P, writeLeft, writeRight); //partition_block_without_compress(d, P, writeLeft, writeRight); } @@ -534,8 +557,8 @@ class vxsort { const auto rightAlign = hint.right_align; const auto rai = ~((rightAlign - 1) >> 31); const auto lai = leftAlign >> 31; - const auto preAlignedLeft = (TV*) (left + leftAlign); - const auto preAlignedRight = (TV*) (right + rightAlign - N); + TV* const preAlignedLeft = (TV*) (left + leftAlign); + TV* const preAlignedRight = (TV*) (right + rightAlign - N); #ifdef VXSORT_STATS vxsort_stats::bump_vec_loads(2); @@ -554,8 +577,8 @@ class vxsort { // were actually needed to be written to the right hand side // e) We write the right portion of the left vector to the right side // now that its write position has been updated - auto RT0 = MT::load_vec(preAlignedRight); - auto LT0 = MT::load_vec(preAlignedLeft); + TV RT0 = MT::load_vec(preAlignedRight); + TV LT0 = MT::load_vec(preAlignedLeft); auto rtMask = MT::get_cmpgt_mask(RT0, P); auto ltMask = MT::get_cmpgt_mask(LT0, P); const auto rtPopCountRightPart = max(_mm_popcnt_u32(rtMask), rightAlign); @@ -617,8 +640,8 @@ class vxsort { * larger-than than all values contained within the provided array. */ NOINLINE void sort(T* left, T* right, - T left_hint = std::numeric_limits::Min(), - T right_hint = std::numeric_limits::Max()) + T left_hint = std::numeric_limits::min(), + T right_hint = std::numeric_limits::max()) { // init_isa_detection(); diff --git a/src/coreclr/inc/palclr_win.h b/src/coreclr/inc/palclr_win.h index a9ee78e32f42f..be0b725e1a689 100644 --- a/src/coreclr/inc/palclr_win.h +++ b/src/coreclr/inc/palclr_win.h @@ -140,8 +140,4 @@ typedef HMODULE NATIVE_LIBRARY_HANDLE; #endif // HOST_WINDOWS -#ifndef FALLTHROUGH -#define FALLTHROUGH __fallthrough -#endif // FALLTHROUGH - #endif // __PALCLR_WIN_H__ diff --git a/src/coreclr/nativeaot/BuildIntegration/Microsoft.NETCore.Native.Unix.targets b/src/coreclr/nativeaot/BuildIntegration/Microsoft.NETCore.Native.Unix.targets index 42a05293f7697..d82f02fd7f17a 100644 --- a/src/coreclr/nativeaot/BuildIntegration/Microsoft.NETCore.Native.Unix.targets +++ b/src/coreclr/nativeaot/BuildIntegration/Microsoft.NETCore.Native.Unix.targets @@ -59,6 +59,8 @@ The .NET Foundation licenses this file to you under the MIT license. libeventpipe-enabled true + libRuntime.VxsortEnabled + libRuntime.VxsortDisabled libstandalonegc-disabled libstandalonegc-enabled @@ -117,6 +119,7 @@ The .NET Foundation licenses this file to you under the MIT license. + diff --git a/src/coreclr/nativeaot/Runtime/CMakeLists.txt b/src/coreclr/nativeaot/Runtime/CMakeLists.txt index 3d0dc1541af04..c1bb58caa30b9 100644 --- a/src/coreclr/nativeaot/Runtime/CMakeLists.txt +++ b/src/coreclr/nativeaot/Runtime/CMakeLists.txt @@ -185,7 +185,7 @@ if (CLR_CMAKE_TARGET_APPLE) ) endif (CLR_CMAKE_TARGET_APPLE) -if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) +if (CLR_CMAKE_TARGET_ARCH_AMD64) set(VXSORT_SOURCES ${GC_DIR}/vxsort/isa_detection.cpp ${GC_DIR}/vxsort/do_vxsort_avx2.cpp @@ -201,7 +201,7 @@ if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) set(DUMMY_VXSORT_SOURCES ${GC_DIR}/vxsort/dummy.cpp ) -endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) +endif (CLR_CMAKE_TARGET_ARCH_AMD64) list(APPEND RUNTIME_SOURCES_ARCH_ASM ${ARCH_SOURCES_DIR}/AllocFast.${ASM_SUFFIX} diff --git a/src/coreclr/nativeaot/Runtime/Full/CMakeLists.txt b/src/coreclr/nativeaot/Runtime/Full/CMakeLists.txt index e665a6c88ee10..f9b390e18d117 100644 --- a/src/coreclr/nativeaot/Runtime/Full/CMakeLists.txt +++ b/src/coreclr/nativeaot/Runtime/Full/CMakeLists.txt @@ -41,10 +41,10 @@ if(CLR_CMAKE_TARGET_WIN32) add_dependencies(standalonegc-enabled aot_etw_headers) endif() -if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_TARGET_ARCH_AMD64) +if (CLR_CMAKE_TARGET_ARCH_AMD64) add_library(Runtime.VxsortEnabled STATIC ${VXSORT_SOURCES}) add_library(Runtime.VxsortDisabled STATIC ${DUMMY_VXSORT_SOURCES}) -endif (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_TARGET_ARCH_AMD64) +endif (CLR_CMAKE_TARGET_ARCH_AMD64) target_compile_definitions(Runtime.ServerGC PRIVATE -DFEATURE_SVR_GC) @@ -116,13 +116,15 @@ install_static_library(Runtime.ServerGC aotsdk nativeaot) install_static_library(standalonegc-disabled aotsdk nativeaot) install_static_library(standalonegc-enabled aotsdk nativeaot) if (CLR_CMAKE_TARGET_WIN32) - if (CLR_CMAKE_TARGET_ARCH_AMD64) - install_static_library(Runtime.VxsortEnabled aotsdk nativeaot) - install_static_library(Runtime.VxsortDisabled aotsdk nativeaot) - install_static_library(Runtime.VxsortEnabled.GuardCF aotsdk nativeaot) - endif (CLR_CMAKE_TARGET_ARCH_AMD64) install_static_library(Runtime.ServerGC.GuardCF aotsdk nativeaot) add_dependencies(Runtime.ServerGC.GuardCF aot_eventing_headers) install_static_library(standalonegc-disabled.GuardCF aotsdk nativeaot) install_static_library(standalonegc-enabled.GuardCF aotsdk nativeaot) endif (CLR_CMAKE_TARGET_WIN32) +if (CLR_CMAKE_TARGET_ARCH_AMD64) + install_static_library(Runtime.VxsortEnabled aotsdk nativeaot) + install_static_library(Runtime.VxsortDisabled aotsdk nativeaot) + if (CLR_CMAKE_TARGET_WIN32) + install_static_library(Runtime.VxsortEnabled.GuardCF aotsdk nativeaot) + endif (CLR_CMAKE_TARGET_WIN32) +endif (CLR_CMAKE_TARGET_ARCH_AMD64) \ No newline at end of file diff --git a/src/coreclr/pal/inc/pal.h b/src/coreclr/pal/inc/pal.h index 0016424977006..7d27a1109b450 100644 --- a/src/coreclr/pal/inc/pal.h +++ b/src/coreclr/pal/inc/pal.h @@ -175,14 +175,6 @@ extern bool g_arm64_atomics_present; #define __has_cpp_attribute(x) (0) #endif -#ifndef FALLTHROUGH -#if __has_cpp_attribute(fallthrough) -#define FALLTHROUGH [[fallthrough]] -#else // __has_cpp_attribute(fallthrough) -#define FALLTHROUGH -#endif // __has_cpp_attribute(fallthrough) -#endif // FALLTHROUGH - #ifndef PAL_STDCPP_COMPAT #if __GNUC__ diff --git a/src/coreclr/pal/inc/rt/specstrings_strict.h b/src/coreclr/pal/inc/rt/specstrings_strict.h index dadb49930ceb8..52ade79cde13c 100644 --- a/src/coreclr/pal/inc/rt/specstrings_strict.h +++ b/src/coreclr/pal/inc/rt/specstrings_strict.h @@ -630,7 +630,6 @@ #define __callback __allowed(on_function) #define __format_string __allowed(on_parameter_or_return) #define __blocksOn(resource) __allowed(on_function) -#define __fallthrough __allowed(as_statement) #define __range(lb,ub) __allowed(on_return) #define __in_range(lb,ub) _SAL_VERSION_CHECK(__in_range) #define __out_range(lb,ub) _SAL_VERSION_CHECK(__out_range) diff --git a/src/coreclr/pal/inc/rt/specstrings_undef.h b/src/coreclr/pal/inc/rt/specstrings_undef.h index b0e1848c5eb86..374b10069c1bf 100644 --- a/src/coreclr/pal/inc/rt/specstrings_undef.h +++ b/src/coreclr/pal/inc/rt/specstrings_undef.h @@ -261,7 +261,6 @@ #undef __encoded_array #undef __encoded_pointer #undef __exceptthat -#undef __fallthrough #undef __field_bcount #undef __field_bcount_full #undef __field_bcount_full_opt diff --git a/src/coreclr/pal/src/include/pal/palinternal.h b/src/coreclr/pal/src/include/pal/palinternal.h index 041118d391651..15887d0377382 100644 --- a/src/coreclr/pal/src/include/pal/palinternal.h +++ b/src/coreclr/pal/src/include/pal/palinternal.h @@ -769,12 +769,6 @@ const char StackOverflowMessage[] = "Stack overflow.\n"; #endif // __cplusplus -#if __has_cpp_attribute(fallthrough) -#define FALLTHROUGH [[fallthrough]] -#else -#define FALLTHROUGH -#endif - DWORD PALAPI GetCurrentSessionId(); #endif /* _PAL_INTERNAL_H_ */ diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index 844b8b4731ec8..345d5ac35f00e 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -487,21 +487,6 @@ set(GC_SOURCES_WKS ../gc/softwarewritewatch.cpp ../gc/handletablecache.cpp) -if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) - set ( GC_SOURCES_WKS - ${GC_SOURCES_WKS} - ../gc/vxsort/isa_detection.cpp - ../gc/vxsort/do_vxsort_avx2.cpp - ../gc/vxsort/do_vxsort_avx512.cpp - ../gc/vxsort/machine_traits.avx2.cpp - ../gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp - ../gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp - ../gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp - ../gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp - ../gc/vxsort/smallsort/avx2_load_mask_tables.cpp -) -endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32) - set(GC_HEADERS_WKS ${GC_HEADERS_DAC_AND_WKS_COMMON} ../gc/gceventstatus.h diff --git a/src/installer/pkg/sfx/Microsoft.NETCore.App/Directory.Build.props b/src/installer/pkg/sfx/Microsoft.NETCore.App/Directory.Build.props index aedd89c6147df..e1e1ef7c08aef 100644 --- a/src/installer/pkg/sfx/Microsoft.NETCore.App/Directory.Build.props +++ b/src/installer/pkg/sfx/Microsoft.NETCore.App/Directory.Build.props @@ -145,6 +145,8 @@ + + diff --git a/src/native/libs/Common/pal_io_common.h b/src/native/libs/Common/pal_io_common.h index 328b33f43022c..27022e5c8fe6a 100644 --- a/src/native/libs/Common/pal_io_common.h +++ b/src/native/libs/Common/pal_io_common.h @@ -8,7 +8,6 @@ #include #include #include -#include /** * Our intermediate pollfd struct to normalize the data types diff --git a/src/native/libs/Common/pal_utilities.h b/src/native/libs/Common/pal_utilities.h index 3fece3a08aa3e..7b5fa63b6cac0 100644 --- a/src/native/libs/Common/pal_utilities.h +++ b/src/native/libs/Common/pal_utilities.h @@ -15,6 +15,8 @@ #include #include +#include + #ifdef DEBUG #define assert_err(cond, msg, err) do \ { \ @@ -43,16 +45,6 @@ #define CONST_CAST2(TOTYPE, FROMTYPE, X) ((union { FROMTYPE _q; TOTYPE _nq; }){ ._q = (X) }._nq) #define CONST_CAST(TYPE, X) CONST_CAST2(TYPE, const TYPE, (X)) -#ifndef __has_attribute -#define __has_attribute(x) (0) -#endif - -#if __has_attribute(fallthrough) -#define FALLTHROUGH __attribute__((fallthrough)) -#else -#define FALLTHROUGH -#endif - /** * Abstraction helper method to safely copy strings using strlcpy or strcpy_s * or a different safe copy method, depending on the current platform. diff --git a/src/native/minipal/utils.h b/src/native/minipal/utils.h index 644ed21f2714f..ef840a529f48f 100644 --- a/src/native/minipal/utils.h +++ b/src/native/minipal/utils.h @@ -13,6 +13,25 @@ #define __has_builtin(x) 0 #endif +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#ifdef __cplusplus +# ifndef __has_cpp_attribute +# define __has_cpp_attribute(x) 0 +# endif +# if __has_cpp_attribute(fallthrough) +# define FALLTHROUGH [[fallthrough]] +# else +# define FALLTHROUGH +# endif +#elif __has_attribute(fallthrough) +# define FALLTHROUGH __attribute__((fallthrough)) +#else +# define FALLTHROUGH +#endif + #if defined(_MSC_VER) # if defined(__SANITIZE_ADDRESS__) # define HAS_ADDRESS_SANITIZER diff --git a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs b/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs index e628938c57db8..28b678b5c2b22 100644 --- a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs +++ b/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs @@ -22,7 +22,7 @@ static int Main() long lowerBound, upperBound; lowerBound = 1300 * 1024; // ~1.3 MB - upperBound = 1750 * 1024; // ~1.75 MB + upperBound = 1900 * 1024; // ~1.90 MB if (fileSize < lowerBound || fileSize > upperBound) {