diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e0de355b2a..bfce592432 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -147,11 +147,13 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_1(void* _Firs do { _Advance_bytes(_Last, -32); // vpermq to load left and right, and transpose the lanes - const __m256i _Left = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_First)), 78); - const __m256i _Right = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_Last)), 78); + const __m256i _Left = _mm256_loadu_si256(static_cast<__m256i*>(_First)); + const __m256i _Right = _mm256_loadu_si256(static_cast<__m256i*>(_Last)); + const __m256i _Left_perm = _mm256_permute4x64_epi64(_Left, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Right_perm = _mm256_permute4x64_epi64(_Right, _MM_SHUFFLE(1, 0, 3, 2)); // transpose all the chars in the lanes - const __m256i _Left_reversed = _mm256_shuffle_epi8(_Left, _Reverse_char_lanes_avx); - const __m256i _Right_reversed = _mm256_shuffle_epi8(_Right, _Reverse_char_lanes_avx); + const __m256i _Left_reversed = _mm256_shuffle_epi8(_Left_perm, _Reverse_char_lanes_avx); + const __m256i _Right_reversed = _mm256_shuffle_epi8(_Right_perm, _Reverse_char_lanes_avx); _mm256_storeu_si256(static_cast<__m256i*>(_First), _Right_reversed); _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed); _Advance_bytes(_First, 32); @@ -188,10 +190,12 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_2(void* _Firs _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5); do { _Advance_bytes(_Last, -32); - const __m256i _Left = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_First)), 78); - const __m256i _Right = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_Last)), 78); - const __m256i _Left_reversed = _mm256_shuffle_epi8(_Left, _Reverse_short_lanes_avx); - const __m256i 
_Right_reversed = _mm256_shuffle_epi8(_Right, _Reverse_short_lanes_avx); + const __m256i _Left = _mm256_loadu_si256(static_cast<__m256i*>(_First)); + const __m256i _Right = _mm256_loadu_si256(static_cast<__m256i*>(_Last)); + const __m256i _Left_perm = _mm256_permute4x64_epi64(_Left, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Right_perm = _mm256_permute4x64_epi64(_Right, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Left_reversed = _mm256_shuffle_epi8(_Left_perm, _Reverse_short_lanes_avx); + const __m256i _Right_reversed = _mm256_shuffle_epi8(_Right_perm, _Reverse_short_lanes_avx); _mm256_storeu_si256(static_cast<__m256i*>(_First), _Right_reversed); _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed); _Advance_bytes(_First, 32); @@ -225,10 +229,12 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _Firs _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5); do { _Advance_bytes(_Last, -32); - const __m256i _Left = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_First)), 78); - const __m256i _Right = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<__m256i*>(_Last)), 78); - const __m256i _Left_reversed = _mm256_shuffle_epi32(_Left, 27); - const __m256i _Right_reversed = _mm256_shuffle_epi32(_Right, 27); + const __m256i _Left = _mm256_loadu_si256(static_cast<__m256i*>(_First)); + const __m256i _Right = _mm256_loadu_si256(static_cast<__m256i*>(_Last)); + const __m256i _Left_perm = _mm256_permute4x64_epi64(_Left, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Right_perm = _mm256_permute4x64_epi64(_Right, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Left_reversed = _mm256_shuffle_epi32(_Left_perm, _MM_SHUFFLE(0, 1, 2, 3)); + const __m256i _Right_reversed = _mm256_shuffle_epi32(_Right_perm, _MM_SHUFFLE(0, 1, 2, 3)); _mm256_storeu_si256(static_cast<__m256i*>(_First), _Right_reversed); _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed); _Advance_bytes(_First, 32); @@ -247,8 +253,8 @@ 
__declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _Firs _Advance_bytes(_Last, -16); const __m128i _Left = _mm_loadu_si128(static_cast<__m128i*>(_First)); const __m128i _Right = _mm_loadu_si128(static_cast<__m128i*>(_Last)); - const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, 27); - const __m128i _Right_reversed = _mm_shuffle_epi32(_Right, 27); + const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, _MM_SHUFFLE(0, 1, 2, 3)); + const __m128i _Right_reversed = _mm_shuffle_epi32(_Right, _MM_SHUFFLE(0, 1, 2, 3)); _mm_storeu_si128(static_cast<__m128i*>(_First), _Right_reversed); _mm_storeu_si128(static_cast<__m128i*>(_Last), _Left_reversed); _Advance_bytes(_First, 16); @@ -267,8 +273,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs _Advance_bytes(_Last, -32); const __m256i _Left = _mm256_loadu_si256(static_cast<__m256i*>(_First)); const __m256i _Right = _mm256_loadu_si256(static_cast<__m256i*>(_Last)); - const __m256i _Left_reversed = _mm256_permute4x64_epi64(_Left, 27); - const __m256i _Right_reversed = _mm256_permute4x64_epi64(_Right, 27); + const __m256i _Left_reversed = _mm256_permute4x64_epi64(_Left, _MM_SHUFFLE(0, 1, 2, 3)); + const __m256i _Right_reversed = _mm256_permute4x64_epi64(_Right, _MM_SHUFFLE(0, 1, 2, 3)); _mm256_storeu_si256(static_cast<__m256i*>(_First), _Right_reversed); _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed); _Advance_bytes(_First, 32); @@ -287,8 +293,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs _Advance_bytes(_Last, -16); const __m128i _Left = _mm_loadu_si128(static_cast<__m128i*>(_First)); const __m128i _Right = _mm_loadu_si128(static_cast<__m128i*>(_Last)); - const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, 78); - const __m128i _Right_reversed = _mm_shuffle_epi32(_Right, 78); + const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, _MM_SHUFFLE(1, 0, 3, 2)); + const __m128i _Right_reversed = 
_mm_shuffle_epi32(_Right, _MM_SHUFFLE(1, 0, 3, 2)); _mm_storeu_si128(static_cast<__m128i*>(_First), _Right_reversed); _mm_storeu_si128(static_cast<__m128i*>(_Last), _Left_reversed); _Advance_bytes(_First, 16); @@ -309,8 +315,9 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1( _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5); do { _Advance_bytes(_Last, -32); - const __m256i _Block = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78); - const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block, _Reverse_char_lanes_avx); + const __m256i _Block = _mm256_loadu_si256(static_cast<const __m256i*>(_Last)); + const __m256i _Block_permuted = _mm256_permute4x64_epi64(_Block, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block_permuted, _Reverse_char_lanes_avx); _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 32); } while (_Dest != _Stop_at); @@ -345,8 +352,9 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2( _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5); do { _Advance_bytes(_Last, -32); - const __m256i _Block = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78); - const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block, _Reverse_short_lanes_avx); + const __m256i _Block = _mm256_loadu_si256(static_cast<const __m256i*>(_Last)); + const __m256i _Block_permuted = _mm256_permute4x64_epi64(_Block, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block_permuted, _Reverse_short_lanes_avx); _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 32); } while (_Dest != _Stop_at); @@ -378,8 +386,9 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4( _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5); do { _Advance_bytes(_Last, -32); - const __m256i _Block = 
_mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78); - const __m256i _Block_reversed = _mm256_shuffle_epi32(_Block, 27); + const __m256i _Block = _mm256_loadu_si256(static_cast<const __m256i*>(_Last)); + const __m256i _Block_permuted = _mm256_permute4x64_epi64(_Block, _MM_SHUFFLE(1, 0, 3, 2)); + const __m256i _Block_reversed = _mm256_shuffle_epi32(_Block_permuted, _MM_SHUFFLE(0, 1, 2, 3)); _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 32); } while (_Dest != _Stop_at); @@ -396,7 +405,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4( do { _Advance_bytes(_Last, -16); const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last)); - const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, 27); + const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, _MM_SHUFFLE(0, 1, 2, 3)); _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 16); } while (_Dest != _Stop_at); @@ -415,7 +424,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( do { _Advance_bytes(_Last, -32); const __m256i _Block = _mm256_loadu_si256(static_cast<const __m256i*>(_Last)); - const __m256i _Block_reversed = _mm256_permute4x64_epi64(_Block, 27); + const __m256i _Block_reversed = _mm256_permute4x64_epi64(_Block, _MM_SHUFFLE(0, 1, 2, 3)); _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 32); } while (_Dest != _Stop_at); @@ -432,7 +441,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( do { _Advance_bytes(_Last, -16); const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last)); - const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, 78); + const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, _MM_SHUFFLE(1, 0, 3, 2)); _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed); _Advance_bytes(_Dest, 16); } while (_Dest != _Stop_at);