Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Getting vxsort working on Linux amd64 #98712

Merged
merged 1 commit into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/coreclr/dlls/mscoree/coreclr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ set(CORECLR_LIBRARIES
gc_pal
)

if(CLR_CMAKE_TARGET_ARCH_AMD64)
list(APPEND CORECLR_LIBRARIES
gc_vxsort
)
endif(CLR_CMAKE_TARGET_ARCH_AMD64)

if(CLR_CMAKE_TARGET_WIN32)
list(APPEND CORECLR_LIBRARIES
${STATIC_MT_CRT_LIB}
Expand Down
26 changes: 11 additions & 15 deletions src/coreclr/gc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,9 @@ else()
windows/Native.rc)
endif(CLR_CMAKE_HOST_UNIX)

if (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
set (GC_SOURCES
${GC_SOURCES}
vxsort/isa_detection.cpp
vxsort/do_vxsort_avx2.cpp
vxsort/do_vxsort_avx512.cpp
vxsort/machine_traits.avx2.cpp
vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
vxsort/smallsort/avx2_load_mask_tables.cpp
)
endif (CLR_CMAKE_TARGET_ARCH_AMD64 AND CLR_CMAKE_TARGET_WIN32)
if (CLR_CMAKE_TARGET_ARCH_AMD64)
add_subdirectory(vxsort)
endif (CLR_CMAKE_TARGET_ARCH_AMD64)

if (CLR_CMAKE_TARGET_WIN32)
set(GC_HEADERS
Expand Down Expand Up @@ -87,7 +76,7 @@ if (CLR_CMAKE_TARGET_WIN32)
handletablepriv.h
objecthandle.h
softwarewritewatch.h
vxsort/do_vxsort.h)
)
endif(CLR_CMAKE_TARGET_WIN32)

if(CLR_CMAKE_HOST_WIN32)
Expand All @@ -100,6 +89,13 @@ endif(CLR_CMAKE_HOST_WIN32)

set (GC_LINK_LIBRARIES ${GC_LINK_LIBRARIES} gc_pal)

if(CLR_CMAKE_TARGET_ARCH_AMD64)
list(APPEND GC_LINK_LIBRARIES
gc_vxsort
)
endif(CLR_CMAKE_TARGET_ARCH_AMD64)


list(APPEND GC_SOURCES ${GC_HEADERS})

convert_to_absolute_path(GC_SOURCES ${GC_SOURCES})
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/gc/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include "gcpriv.h"

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
cshung marked this conversation as resolved.
Show resolved Hide resolved
#define USE_VXSORT
#else
#define USE_INTROSORT
Expand Down Expand Up @@ -10305,11 +10305,11 @@ static void do_vxsort (uint8_t** item_array, ptrdiff_t item_count, uint8_t* rang
{
// above this threshold, using AVX2 for sorting will likely pay off
// despite possible downclocking on some devices
const size_t AVX2_THRESHOLD_SIZE = 8 * 1024;
const ptrdiff_t AVX2_THRESHOLD_SIZE = 8 * 1024;

// above this threshold, using AVX512F for sorting will likely pay off
// despite possible downclocking on current devices
const size_t AVX512F_THRESHOLD_SIZE = 128 * 1024;
const ptrdiff_t AVX512F_THRESHOLD_SIZE = 128 * 1024;

if (item_count <= 1)
return;
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/gcsvr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#define SERVER_GC 1

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
#include "vxsort/do_vxsort.h"
#endif

Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/gcwks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#undef SERVER_GC
#endif

#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS)
#ifdef TARGET_AMD64
#include "vxsort/do_vxsort.h"
#endif

Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/gc/unix/gcenv.unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,6 @@
#define __has_cpp_attribute(x) (0)
#endif

#if __has_cpp_attribute(fallthrough)
#define FALLTHROUGH [[fallthrough]]
#else
#define FALLTHROUGH
#endif

#include <algorithm>

#if HAVE_SYS_TIME_H
Expand Down
29 changes: 29 additions & 0 deletions src/coreclr/gc/vxsort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
set(CMAKE_INCLUDE_CURRENT_DIR ON)
include_directories("../env")

if(CLR_CMAKE_HOST_UNIX)
set_source_files_properties(isa_detection.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(do_vxsort_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(do_vxsort_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(machine_traits.avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX2.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX2.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX512.int64_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/bitonic_sort.AVX512.int32_t.generated.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(smallsort/avx2_load_mask_tables.cpp PROPERTIES COMPILE_FLAGS -mavx2)
endif(CLR_CMAKE_HOST_UNIX)

set (VXSORT_SOURCES
isa_detection.cpp
do_vxsort_avx2.cpp
do_vxsort_avx512.cpp
machine_traits.avx2.cpp
smallsort/bitonic_sort.AVX2.int64_t.generated.cpp
smallsort/bitonic_sort.AVX2.int32_t.generated.cpp
smallsort/bitonic_sort.AVX512.int64_t.generated.cpp
smallsort/bitonic_sort.AVX512.int32_t.generated.cpp
smallsort/avx2_load_mask_tables.cpp
do_vxsort.h
)

add_library(gc_vxsort STATIC ${VXSORT_SOURCES})
31 changes: 1 addition & 30 deletions src/coreclr/gc/vxsort/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,36 +45,7 @@
#define NOINLINE __attribute__((noinline))
#endif

namespace std {
template <class _Ty>
class numeric_limits {
public:
static constexpr _Ty Max() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
static constexpr _Ty Min() { static_assert(sizeof(_Ty) != sizeof(_Ty), "func must be specialized!"); return _Ty(); }
};

template <>
class numeric_limits<int32_t> {
public:
static constexpr int32_t Max() { return 0x7fffffff; }
static constexpr int32_t Min() { return -0x7fffffff - 1; }
};

template <>
class numeric_limits<uint32_t> {
public:
static constexpr uint32_t Max() { return 0xffffffff; }
static constexpr uint32_t Min() { return 0; }
};

template <>
class numeric_limits<int64_t> {
public:
static constexpr int64_t Max() { return 0x7fffffffffffffffi64; }

static constexpr int64_t Min() { return -0x7fffffffffffffffi64 - 1; }
};
} // namespace std
#include <limits>

#ifndef max
template <typename T>
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/gc/vxsort/machine_traits.avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <immintrin.h>
#include <assert.h>
#include <inttypes.h>
#include <type_traits>
#include "defs.h"
#include "machine_traits.h"

Expand Down Expand Up @@ -123,8 +124,7 @@ class vxsort_machine_traits<int64_t, AVX2> {

template <int Shift>
static constexpr bool can_pack(T span) {
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
return ((TU) span) < PACK_LIMIT;
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
}

static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/gc/vxsort/machine_traits.avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "vxsort_targets_enable_avx512.h"

#include <immintrin.h>
#include <type_traits>
#include "defs.h"
#include "machine_traits.h"

Expand Down Expand Up @@ -92,8 +93,7 @@ class vxsort_machine_traits<int64_t, AVX512> {

template <int Shift>
static constexpr bool can_pack(T span) {
const auto PACK_LIMIT = (((TU) std::numeric_limits<uint32_t>::Max() + 1)) << Shift;
return ((TU) span) < PACK_LIMIT;
return ((TU) span) < ((((TU) std::numeric_limits<uint32_t>::max() + 1)) << Shift);
}

static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
Expand Down
12 changes: 6 additions & 6 deletions src/coreclr/gc/vxsort/packer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class packer {
public:

static void pack(TFrom *mem, size_t len, TFrom base) {
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
auto baseVec = MT::broadcast(offset);

auto pre_aligned_mem = reinterpret_cast<TFrom *>(reinterpret_cast<size_t>(mem) & ~ALIGN_MASK);
Expand Down Expand Up @@ -87,8 +87,8 @@ class packer {

assert(AH::is_aligned(mem_read));

auto memv_read = (TV *) mem_read;
auto memv_write = (TV *) mem_write;
TV * memv_read = (TV *) mem_read;
TV * memv_write = (TV *) mem_write;

auto lenv = len / N;
len -= (lenv * N);
Expand Down Expand Up @@ -156,7 +156,7 @@ class packer {


static void unpack(TTo *mem, size_t len, TFrom base) {
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::Min());
TFrom offset = MT::template shift_n_sub<Shift>(base, (TFrom) std::numeric_limits<TTo>::min());
auto baseVec = MT::broadcast(offset);

auto mem_read = mem + len;
Expand Down Expand Up @@ -184,8 +184,8 @@ class packer {
assert(AH::is_aligned(mem_read));

auto lenv = len / (N * 2);
auto memv_read = ((TV *) mem_read) - 1;
auto memv_write = ((TV *) mem_write) - 2;
TV * memv_read = ((TV *) mem_read) - 1;
TV * memv_write = ((TV *) mem_write) - 2;
len -= lenv * N * 2;

while (lenv >= Unroll) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];

template<> struct bitonic<int32_t, AVX2> {
static const int N = 8;
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
public:

static INLINE void sort_01v_ascending(__m256i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ extern "C" const uint8_t mask_table_8[M8_SIZE];

template<> struct bitonic<int64_t, AVX2> {
static const int N = 4;
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
public:

static INLINE void sort_01v_ascending(__m256i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace vxsort {
namespace smallsort {
template<> struct bitonic<int32_t, AVX512> {
static const int N = 16;
static constexpr int32_t MAX = std::numeric_limits<int32_t>::Max();
static constexpr int32_t MAX = std::numeric_limits<int32_t>::max();
public:

static INLINE void sort_01v_ascending(__m512i& d01) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace vxsort {
namespace smallsort {
template<> struct bitonic<int64_t, AVX512> {
static const int N = 8;
static constexpr int64_t MAX = std::numeric_limits<int64_t>::Max();
static constexpr int64_t MAX = std::numeric_limits<int64_t>::max();
public:

static INLINE void sort_01v_ascending(__m512i& d01) {
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/vxsort/smallsort/codegen/avx2.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def generate_prologue(self, f):
template<> struct bitonic<{t}, AVX2> {{
static const int N = {self.vector_size()};
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
public:
"""
print(s, file=f)
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/gc/vxsort/smallsort/codegen/avx512.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def generate_prologue(self, f):
namespace smallsort {{
template<> struct bitonic<{t}, AVX512> {{
static const int N = {self.vector_size()};
static constexpr {t} MAX = std::numeric_limits<{t}>::Max();
static constexpr {t} MAX = std::numeric_limits<{t}>::max();
public:
"""
print(s, file=f)
Expand Down
Loading
Loading