From 9888d7fa31082813aa31db067a84c749f8cd77b6 Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Mon, 2 Sep 2024 15:29:26 +0900 Subject: [PATCH] perf(simd): avx2 fallback to swar instead of sse4.2 This has massive implications on the default runtime perf, improving how the code is lowered/inlined. (Falling back to SSE4.2 for a handful of bytes was wasteful). Should supersede #175, #156 --- src/simd/avx2.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index 6a7edc1..c1a41f9 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -1,7 +1,7 @@ use crate::iter::Bytes; #[inline] -#[target_feature(enable = "avx2", enable = "sse4.2")] +#[target_feature(enable = "avx2")] pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 32 { let advance = match_url_char_32_avx(bytes.as_ref()); @@ -11,8 +11,8 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { return; } } - // do both, since avx2 only works when bytes.len() >= 32 - super::sse42::match_uri_vectored(bytes) + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_uri_vectored(bytes) } #[inline(always)] @@ -56,7 +56,7 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { r.trailing_zeros() as usize } -#[target_feature(enable = "avx2", enable = "sse4.2")] +#[target_feature(enable = "avx2")] pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 32 { let advance = match_header_value_char_32_avx(bytes.as_ref()); @@ -66,8 +66,8 @@ pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { return; } } - // do both, since avx2 only works when bytes.len() >= 32 - super::sse42::match_header_value_vectored(bytes) + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_header_value_vectored(bytes) } #[inline(always)]