Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorize basic_string::find_last_of #4934

Merged
merged 20 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions benchmarks/src/find_first_of.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,38 @@
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <numeric>
#include <string>
#include <type_traits>
#include <vector>

using namespace std;

enum class AlgType : bool { std_func, str_member };
enum class AlgType { std_func, str_member_first, str_member_last };

template <AlgType Alg, class T, T Start = T{'a'}>
template <AlgType Alg, class T, T Start = T{'!'}>
void bm(benchmark::State& state) {
const size_t Pos = static_cast<size_t>(state.range(0));
const size_t NSize = static_cast<size_t>(state.range(1));
const size_t HSize = Pos * 2;
const size_t Which = 0;

using container = conditional_t<Alg == AlgType::str_member, basic_string<T>, vector<T>>;
using container = conditional_t<Alg == AlgType::std_func, vector<T>, basic_string<T>>;

container h(HSize, T{'.'});
constexpr T HaystackFiller{' '};
static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler.");

container h(HSize, HaystackFiller);
container n(NSize, T{0});

if (NSize - 1 > static_cast<size_t>(numeric_limits<T>::max()) - static_cast<size_t>(Start)) {
puts("ERROR: The following iota() would overflow.");
abort();
}

iota(n.begin(), n.end(), Start);

if (Pos >= HSize || Which >= NSize) {
Expand All @@ -37,26 +48,34 @@ void bm(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(h);
benchmark::DoNotOptimize(n);
if constexpr (Alg == AlgType::str_member) {
benchmark::DoNotOptimize(h.find_first_of(n.data(), 0, n.size()));
if constexpr (Alg == AlgType::str_member_first) {
benchmark::DoNotOptimize(h.find_first_of(n));
} else if constexpr (Alg == AlgType::str_member_last) {
benchmark::DoNotOptimize(h.find_last_of(n));
} else {
benchmark::DoNotOptimize(find_first_of(h.begin(), h.end(), n.begin(), n.end()));
}
}
}

void common_args(auto bm) {
bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2});
bm->Args({102, 4})->Args({325, 1})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7});
bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({102, 4});
bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7});
}

BENCHMARK(bm<AlgType::std_func, uint8_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint16_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint32_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint64_t>)->Apply(common_args);

BENCHMARK(bm<AlgType::str_member, char>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member, wchar_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member, wchar_t, L'\x03B1'>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, char>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, wchar_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, wchar_t, L'\x03B1'>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, char32_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, char32_t, U'\x03B1'>)->Apply(common_args);

BENCHMARK(bm<AlgType::str_member_last, char>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_last, wchar_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_last, wchar_t, L'\x03B1'>)->Apply(common_args);

BENCHMARK_MAIN();
116 changes: 92 additions & 24 deletions stl/inc/__msvc_string_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,42 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

#if _USE_STD_VECTOR_ALGORITHMS
extern "C" {
// The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms
// won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by
// those pointers. The optimizer also assumes in that case that a pointer parameter is not returned to the caller via
// the return value, so functions using "noalias" must usually return void. This attribute is valuable because these
// functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the
// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to
// unanalyzable routines may modify those arrays.

// Entry points for the find_last_of search. The numeric suffix is the element size in bytes:
// _Find_last_of_pos_vectorized dispatches to the "_1" function when sizeof(element) == 1 and to the "_2" function when
// sizeof(element) == 2. Haystack and needle are passed as untyped pointers together with their lengths, and the
// result is a size_t (not a pointer), so returning it is compatible with the "noalias" restriction described above.
// NOTE(review): the lengths appear to be element counts (callers pass _Hay_start + 1 and _Needle_size), and the
// result is presumably the index of the last haystack element found in the needle, or size_t(-1) when there is
// none - confirm against the implementation of these functions.
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;

} // extern "C"

_STD_BEGIN

template <class _Ty1, class _Ty2>
size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
    const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
    // Typed front end for the extern "C" search routines: selects the routine whose numeric suffix equals the
    // element width in bytes and forwards the haystack/needle pointers and lengths to it unchanged.
    // The haystack and needle element types may differ, but they must have the same width.
    _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));

    constexpr size_t _Elem_width = sizeof(_Ty1);

    if constexpr (_Elem_width == 2) {
        // two-byte elements
        return ::__std_find_last_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else if constexpr (_Elem_width == 1) {
        // one-byte elements
        return ::__std_find_last_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else {
        // only 1- and 2-byte elements have a vectorized routine declared here
        _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
    }
}

_STD_END

#endif // _USE_STD_VECTOR_ALGORITHMS

_STD_BEGIN
#ifdef __clang__
#define _HAS_MEMCPY_MEMMOVE_INTRINSICS 1
Expand Down Expand Up @@ -731,10 +767,14 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t
const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of;

// Additional condition for when the vectorization outperforms the table lookup
const bool _Use_bitmap = !_Try_vectorize || (sizeof(_Elem) > 1 && sizeof(_Elem) * _Needle_size > 16);
#else
constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48
: sizeof(_Elem) == 8 ? 8
: 16;

const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold;
#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
const bool _Use_bitmap = true;
#endif // _USE_STD_VECTOR_ALGORITHMS
#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^

if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;
Expand Down Expand Up @@ -776,42 +816,70 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t
return static_cast<size_t>(-1); // no match
}

template <class _Traits, bool _Special = _Is_implementation_handled_char_traits<_Traits>>
template <class _Traits>
constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t<_Traits> _Haystack,
const size_t _Hay_size, const size_t _Start_at, _In_reads_(_Needle_size) const _Traits_ptr_t<_Traits> _Needle,
const size_t _Needle_size) noexcept {
// in [_Haystack, _Haystack + _Hay_size), look for last of [_Needle, _Needle + _Needle_size), before _Start_at
if (_Needle_size != 0 && _Hay_size != 0) { // worth searching, do it
if constexpr (_Special) {
_String_bitmap<typename _Traits::char_type> _Matches;
if (!_Matches._Mark(_Needle, _Needle + _Needle_size)) { // couldn't put one of the characters into the
// bitmap, fall back to the serial algorithm
return _Traits_find_last_of<_Traits, false>(_Haystack, _Hay_size, _Start_at, _Needle, _Needle_size);
if (_Needle_size == 0 || _Hay_size == 0) { // not worth searching
return static_cast<size_t>(-1);
}

const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1);

if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
if (!_STD _Is_constant_evaluated()) {
using _Elem = typename _Traits::char_type;

bool _Use_bitmap = true;
#if _USE_STD_VECTOR_ALGORITHMS
bool _Try_vectorize = false;

if constexpr (sizeof(_Elem) <= 2) {
_Try_vectorize = _Hay_start + 1 > _Threshold_find_first_of;
// Additional condition for when the vectorization outperforms the table lookup
constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8;

_Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold;
}
#endif // _USE_STD_VECTOR_ALGORITHMS

for (auto _Match_try = _Haystack + (_STD min)(_Start_at, _Hay_size - 1);; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}
if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
if (_Match_try == _Haystack) {
return static_cast<size_t>(-1); // at beginning, no more chance for match
}
}
}

// couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms
}
} else {
for (auto _Match_try = _Haystack + (_STD min)(_Start_at, _Hay_size - 1);; --_Match_try) {
if (_Traits::find(_Needle, _Needle_size, *_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (sizeof(_Elem) <= 2) {
if (_Try_vectorize) {
return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size);
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS
}
}

return static_cast<size_t>(-1); // no match
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Traits::find(_Needle, _Needle_size, *_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
return static_cast<size_t>(-1); // at beginning, no more chance for match
}
}
}

template <class _Traits, bool _Special = _Is_implementation_handled_char_traits<_Traits>>
Expand Down
Loading