Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vector_algorithms.cpp: find, find_last, count: make AVX2 path avoid SSE path and (for some types) fallback #4570

Merged
merged 4 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 31 additions & 18 deletions benchmarks/src/find_and_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <ranges>
#include <vector>

enum class Op {
FindSized,
Expand All @@ -15,39 +17,50 @@ enum class Op {

using namespace std;

template <class T, size_t Size, size_t Pos, Op Operation>
template <class T, Op Operation>
void bm(benchmark::State& state) {
T a[Size];
const auto size = static_cast<size_t>(state.range(0));
const auto pos = static_cast<size_t>(state.range(1));

fill_n(a, Size, T{'0'});
if constexpr (Pos < Size) {
a[Pos] = T{'1'};
vector<T> a(size, T{'0'});

if (pos < size) {
a[pos] = T{'1'};
} else {
static_assert(Operation != Op::FindUnsized);
if constexpr (Operation == Op::FindUnsized) {
abort();
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
}
}

for (auto _ : state) {
if constexpr (Operation == Op::FindSized) {
benchmark::DoNotOptimize(ranges::find(a, a + Size, T{'1'}));
benchmark::DoNotOptimize(ranges::find(a.begin(), a.end(), T{'1'}));
} else if constexpr (Operation == Op::FindUnsized) {
benchmark::DoNotOptimize(ranges::find(a, unreachable_sentinel, T{'1'}));
benchmark::DoNotOptimize(ranges::find(a.begin(), unreachable_sentinel, T{'1'}));
} else if constexpr (Operation == Op::Count) {
benchmark::DoNotOptimize(ranges::count(a, a + Size, T{'1'}));
benchmark::DoNotOptimize(ranges::count(a.begin(), a.end(), T{'1'}));
}
}
}

BENCHMARK(bm<uint8_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint8_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<uint8_t, 8021, 3056, Op::Count>);
void common_args(auto bm) {
bm->Args({8021, 3056});
// AVX tail tests
bm->Args({63, 62})->Args({31, 30})->Args({15, 14})->Args({7, 6});
}


BENCHMARK(bm<uint8_t, Op::FindSized>)->Apply(common_args);
BENCHMARK(bm<uint8_t, Op::FindUnsized>)->Apply(common_args);
BENCHMARK(bm<uint8_t, Op::Count>)->Apply(common_args);

BENCHMARK(bm<uint16_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint16_t, 8021, 3056, Op::Count>);
BENCHMARK(bm<uint16_t, Op::FindSized>)->Apply(common_args);
BENCHMARK(bm<uint16_t, Op::Count>)->Apply(common_args);

BENCHMARK(bm<uint32_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint32_t, 8021, 3056, Op::Count>);
BENCHMARK(bm<uint32_t, Op::FindSized>)->Apply(common_args);
BENCHMARK(bm<uint32_t, Op::Count>)->Apply(common_args);

BENCHMARK(bm<uint64_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint64_t, 8021, 3056, Op::Count>);
BENCHMARK(bm<uint64_t, Op::FindSized>)->Apply(common_args);
BENCHMARK(bm<uint64_t, Op::Count>)->Apply(common_args);

BENCHMARK_MAIN();
92 changes: 68 additions & 24 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1837,15 +1837,15 @@ namespace {
template <class _Traits, class _Ty>
const void* __stdcall __std_find_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
#ifndef _M_ARM64EC
size_t _Size_bytes = _Byte_length(_First, _Last);
const size_t _Size_bytes = _Byte_length(_First, _Last);

const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
if (_Avx_size != 0 && _Use_avx2()) {
if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

const __m256i _Comparand = _Traits::_Set_avx(_Val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Avx_size);

do {
const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
Expand All @@ -1858,14 +1858,30 @@ namespace {

_Advance_bytes(_First, 32);
} while (_First != _Stop_at);
_Size_bytes &= 0x1F;
}

const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
if (_Sse_size != 0 && _Use_sse42()) {
if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
const int _Bingo =
_mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));

if (_Bingo != 0) {
const unsigned long _Offset = _tzcnt_u32(_Bingo);
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, _Avx_tail_size);
}

if constexpr (sizeof(_Ty) >= 4) {
return _First;
}
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
const __m128i _Comparand = _Traits::_Set_sse(_Val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Sse_size);

do {
const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
Expand All @@ -1892,15 +1908,15 @@ namespace {
const void* __stdcall __std_find_last_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
const void* const _Real_last = _Last;
#ifndef _M_ARM64EC
size_t _Size_bytes = _Byte_length(_First, _Last);
const size_t _Size_bytes = _Byte_length(_First, _Last);

const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
if (_Avx_size != 0 && _Use_avx2()) {
if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

const __m256i _Comparand = _Traits::_Set_avx(_Val);
const void* _Stop_at = _Last;
_Rewind_bytes(_Stop_at, _Avx_size);

do {
_Rewind_bytes(_Last, 32);
const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_Last));
Expand All @@ -1912,14 +1928,29 @@ namespace {
return _Last;
}
} while (_Last != _Stop_at);
_Size_bytes &= 0x1F;
}

const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
if (_Sse_size != 0 && _Use_sse42()) {
if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
_Rewind_bytes(_Last, _Avx_tail_size);
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_Last), _Tail_mask);
const int _Bingo =
_mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));

if (_Bingo != 0) {
const unsigned long _Offset = _lzcnt_u32(_Bingo);
_Advance_bytes(_Last, (31 - _Offset) - (sizeof(_Ty) - 1));
return _Last;
}
}

if constexpr (sizeof(_Ty) >= 4) {
return _Real_last;
}
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
const __m128i _Comparand = _Traits::_Set_sse(_Val);
const void* _Stop_at = _Last;
_Rewind_bytes(_Stop_at, _Sse_size);

do {
_Rewind_bytes(_Last, 16);
const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
Expand Down Expand Up @@ -1952,40 +1983,53 @@ namespace {
size_t _Result = 0;

#ifndef _M_ARM64EC
size_t _Size_bytes = _Byte_length(_First, _Last);
const size_t _Size_bytes = _Byte_length(_First, _Last);

const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
if (_Avx_size != 0 && _Use_avx2()) {
if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
const __m256i _Comparand = _Traits::_Set_avx(_Val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Avx_size);

do {
const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
_Result += __popcnt(_Bingo); // Assume available with SSE4.2
_Advance_bytes(_First, 32);
} while (_First != _Stop_at);
_Size_bytes &= 0x1F;

if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
const int _Bingo =
_mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
_Result += __popcnt(_Bingo); // Assume available with SSE4.2
_Advance_bytes(_First, _Avx_tail_size);
}

_mm256_zeroupper(); // TRANSITION, DevCom-10331414
}

const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
if (_Sse_size != 0 && _Use_sse42()) {
_Result >>= _Traits::_Shift;

if constexpr (sizeof(_Ty) >= 4) {
return _Result;
}
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
const __m128i _Comparand = _Traits::_Set_sse(_Val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Sse_size);

do {
const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
_Result += __popcnt(_Bingo); // Assume available with SSE4.2
_Advance_bytes(_First, 16);
} while (_First != _Stop_at);

_Result >>= _Traits::_Shift;
}
#endif // !_M_ARM64EC
_Result >>= _Traits::_Shift;
auto _Ptr = static_cast<const _Ty*>(_First);
for (; _Ptr != _Last; ++_Ptr) {

for (auto _Ptr = static_cast<const _Ty*>(_First); _Ptr != _Last; ++_Ptr) {
if (*_Ptr == _Val) {
++_Result;
}
Expand Down
Loading