Getting vxsort working on Linux amd64 (#98712)

Co-authored-by: Jan Vorlicek <[email protected]> Co-authored-by: Adeel Mujahid <[email protected]>
dotnet · Apr 3, 2024 · c08bd7b · c08bd7b
1 parent 71f45aa
commit c08bd7b
Show file tree

Hide file tree

Showing 32 changed files with 142 additions and 141 deletions.
diff --git a/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt b/src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt
@@ -111,6 +111,12 @@ set(CORECLR_LIBRARIES
     gc_pal
 )
 
+if(CLR_CMAKE_TARGET_ARCH_AMD64)
+    list(APPEND CORECLR_LIBRARIES
+        gc_vxsort
+    )
+endif(CLR_CMAKE_TARGET_ARCH_AMD64)
+
 if(CLR_CMAKE_TARGET_WIN32)
     list(APPEND CORECLR_LIBRARIES
         ${STATIC_MT_CRT_LIB}

diff --git a/src/coreclr/gc/CMakeLists.txt b/src/coreclr/gc/CMakeLists.txt
@@ -36,20 +36,9 @@ else()
     windows/Native.rc)
 endif(CLR_CMAKE_HOST_UNIX)
 
-if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
-  set (GC_SOURCES
-    ${GC_SOURCES}
-    vxsort/isa_detection.cpp
-    vxsort/do_vxsort_avx2.cpp
-    vxsort/do_vxsort_avx512.cpp
-    vxsort/machine_traits.avx2.cpp
-    vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
-    vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
-    vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
-    vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
-    vxsort/smallsort/avx2_load_mask_tables.cpp
-)
-endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
+if (CLR_CMAKE_TARGET_ARCH_AMD64)
+  add_subdirectory(vxsort)
+endif (CLR_CMAKE_TARGET_ARCH_AMD64)
 
 if (CLR_CMAKE_TARGET_WIN32)
   set(GC_HEADERS
@@ -87,7 +76,7 @@ if (CLR_CMAKE_TARGET_WIN32)
     handletablepriv.h
     objecthandle.h
     softwarewritewatch.h
-    vxsort/do_vxsort.h)
+  )
 endif(CLR_CMAKE_TARGET_WIN32)
 
 if(CLR_CMAKE_HOST_WIN32)
@@ -100,6 +89,13 @@ endif(CLR_CMAKE_HOST_WIN32)
 
 set (GC_LINK_LIBRARIES ${GC_LINK_LIBRARIES} gc_pal)
 
+if(CLR_CMAKE_TARGET_ARCH_AMD64)
+    list(APPEND GC_LINK_LIBRARIES
+        gc_vxsort
+    )
+endif(CLR_CMAKE_TARGET_ARCH_AMD64)
+
+
 list(APPEND GC_SOURCES ${GC_HEADERS})
 
 convert_to_absolute_path(GC_SOURCES ${GC_SOURCES})

diff --git a/src/coreclr/gc/gc.cpp b/src/coreclr/gc/gc.cpp
@@ -18,7 +18,7 @@
 
 #include "gcpriv.h"
 
-#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
+#ifdef TARGET_AMD64
 #define USE_VXSORT
 #else
 #define USE_INTROSORT
@@ -10305,11 +10305,11 @@ static void do_vxsort (uint8_t** item_array, ptrdiff_t item_count, uint8_t* rang
 {
     // above this threshold, using AVX2 for sorting will likely pay off
     // despite possible downclocking on some devices
-    const size_t AVX2_THRESHOLD_SIZE = 8 * 1024;
+    const ptrdiff_t AVX2_THRESHOLD_SIZE = 8 * 1024;
 
     // above this threshold, using AVX512F for sorting will likely pay off
     // despite possible downclocking on current devices
-    const size_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
+    const ptrdiff_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
 
     if (item_count <= 1)
         return;

diff --git a/src/coreclr/gc/gcsvr.cpp b/src/coreclr/gc/gcsvr.cpp
@@ -20,7 +20,7 @@
 
 #define SERVER_GC 1
 
-#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
+#ifdef TARGET_AMD64
 #include "vxsort/do_vxsort.h"
 #endif
 

diff --git a/src/coreclr/gc/gcwks.cpp b/src/coreclr/gc/gcwks.cpp
@@ -20,7 +20,7 @@
 #undef SERVER_GC
 #endif
 
-#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
+#ifdef TARGET_AMD64
 #include "vxsort/do_vxsort.h"
 #endif
 

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -35,12 +35,6 @@
 #define __has_cpp_attribute(x) (0)
 #endif
 
-#if __has_cpp_attribute(fallthrough)
-#define FALLTHROUGH [[fallthrough]]
-#else
-#define FALLTHROUGH
-#endif
-
 #include <algorithm>
 
 #if HAVE_SYS_TIME_H

diff --git a/src/coreclr/gc/vxsort/CMakeLists.txt b/src/coreclr/gc/vxsort/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+include_directories("../env")
+
+if(CLR_CMAKE_HOST_UNIX)
+  set_source_files_properties(isa_detection.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(do_vxsort_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(do_vxsort_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(machine_traits.avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(smallsort/bitonic_sort.AVX2.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(smallsort/bitonic_sort.AVX2.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(smallsort/bitonic_sort.AVX512.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(smallsort/bitonic_sort.AVX512.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(smallsort/avx2_load_mask_tables.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+endif(CLR_CMAKE_HOST_UNIX)
+
+set (VXSORT_SOURCES
+  isa_detection.cpp
+  do_vxsort_avx2.cpp
+  do_vxsort_avx512.cpp
+  machine_traits.avx2.cpp
+  smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
+  smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
+  smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
+  smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
+  smallsort/avx2_load_mask_tables.cpp
+  do_vxsort.h
+)
+
+add_library(gc_vxsort STATIC ${VXSORT_SOURCES})
diff --git a/src/coreclr/gc/vxsort/defs.h b/src/coreclr/gc/vxsort/defs.h
@@ -45,36 +45,7 @@
 #define NOINLINE __attribute__((noinline))
 #endif
 
-namespace std {
-template <class _Ty>
-class numeric_limits {
-   public:
-    static constexpr _Ty Max() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
-    static constexpr _Ty Min() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
-};
-
-template <>
-class numeric_limits<int32_t> {
-public:
-    static constexpr int32_t Max() { return 0x7fffffff; }
-    static constexpr int32_t Min() { return -0x7fffffff - 1; }
-};
-
-template <>
-class numeric_limits<uint32_t> {
-public:
-    static constexpr uint32_t Max() { return 0xffffffff; }
-    static constexpr uint32_t Min() { return 0; }
-};
-
-template <>
-class numeric_limits<int64_t> {
-   public:
-    static constexpr int64_t Max() { return 0x7fffffffffffffffi64; }
-
-    static constexpr int64_t Min() { return -0x7fffffffffffffffi64 - 1; }
-};
-}  // namespace std
+#include <limits>
 
 #ifndef max
 template <typename T>

diff --git a/src/coreclr/gc/vxsort/machine_traits.avx2.h b/src/coreclr/gc/vxsort/machine_traits.avx2.h
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 #include <assert.h>
 #include <inttypes.h>
+#include <type_traits>
 #include "defs.h"
 #include "machine_traits.h"
 
@@ -123,8 +124,7 @@ class vxsort_machine_traits<int64_t, AVX2> {
 
     template <int Shift>
     static constexpr bool can_pack(T span) {
-        const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
-        return ((TU) span) < PACK_LIMIT;
+        return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
     }
 
     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }

diff --git a/src/coreclr/gc/vxsort/machine_traits.avx512.h b/src/coreclr/gc/vxsort/machine_traits.avx512.h
@@ -11,6 +11,7 @@
 #include "vxsort_targets_enable_avx512.h"
 
 #include <immintrin.h>
+#include <type_traits>
 #include "defs.h"
 #include "machine_traits.h"
 
@@ -92,8 +93,7 @@ class vxsort_machine_traits<int64_t, AVX512> {
 
     template <int Shift>
     static constexpr bool can_pack(T span) {
-        const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
-        return ((TU) span) < PACK_LIMIT;
+        return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
     }
 
     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }

diff --git a/src/coreclr/gc/vxsort/packer.h b/src/coreclr/gc/vxsort/packer.h
@@ -56,7 +56,7 @@ class packer {
    public:
 
     static void pack(TFrom *mem, size_t len, TFrom base) {
-        TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
+        TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
         auto baseVec = MT::broadcast(offset);
 
         auto pre_aligned_mem = reinterpret_cast<TFrom *>(reinterpret_cast<size_t>(mem) & ~ALIGN_MASK);
@@ -87,8 +87,8 @@ class packer {
 
         assert(AH::is_aligned(mem_read));
 
-        auto memv_read = (TV *) mem_read;
-        auto memv_write = (TV *) mem_write;
+        TV * memv_read = (TV *) mem_read;
+        TV * memv_write = (TV *) mem_write;
 
         auto lenv = len / N;
         len -= (lenv * N);
@@ -156,7 +156,7 @@ class packer {
 
 
     static void unpack(TTo *mem, size_t len, TFrom base) {
-        TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
+        TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
         auto baseVec = MT::broadcast(offset);
 
         auto mem_read = mem + len;
@@ -184,8 +184,8 @@ class packer {
         assert(AH::is_aligned(mem_read));
 
         auto lenv = len / (N * 2);
-        auto memv_read = ((TV *) mem_read) - 1;
-        auto memv_write = ((TV *) mem_write) - 2;
+        TV * memv_read = ((TV *) mem_read) - 1;
+        TV * memv_write = ((TV *) mem_write) - 2;
         len -= lenv * N * 2;
 
         while (lenv >= Unroll) {

diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h
@@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];
 
 template<> struct bitonic<int32_t, AVX2> {
     static const int N = 8;
-    static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
+    static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
 public:
 
     static INLINE void sort_01v_ascending(__m256i& d01) {

diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h
@@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];
 
 template<> struct bitonic<int64_t, AVX2> {
     static const int N = 4;
-    static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
+    static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
 public:
 
     static INLINE void sort_01v_ascending(__m256i& d01) {

diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h
@@ -36,7 +36,7 @@ namespace vxsort {
 namespace smallsort {
 template<> struct bitonic<int32_t, AVX512> {
     static const int N = 16;
-    static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
+    static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
 public:
 
     static INLINE void sort_01v_ascending(__m512i& d01) {

diff --git a/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h b/src/coreclr/gc/vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h
@@ -36,7 +36,7 @@ namespace vxsort {
 namespace smallsort {
 template<> struct bitonic<int64_t, AVX512> {
     static const int N = 8;
-    static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
+    static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
 public:
 
     static INLINE void sort_01v_ascending(__m512i& d01) {

diff --git a/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py b/src/coreclr/gc/vxsort/smallsort/codegen/avx2.py
@@ -303,7 +303,7 @@ def generate_prologue(self, f):
 
 template<> struct bitonic<{t}, AVX2> {{
     static const int N = {self.vector_size()};
-    static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
+    static constexpr {t} MAX = std::numeric_limits<{t}>::max();
 public:
 """
         print(s, file=f)

diff --git a/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py b/src/coreclr/gc/vxsort/smallsort/codegen/avx512.py
@@ -299,7 +299,7 @@ def generate_prologue(self, f):
 namespace smallsort {{
 template<> struct bitonic<{t}, AVX512> {{
     static const int N = {self.vector_size()};
-    static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
+    static constexpr {t} MAX = std::numeric_limits<{t}>::max();
 public:
 """
         print(s, file=f)