Skip to content

Commit

Permalink
perf: fix SIMD-inlining (#131)
Browse files Browse the repository at this point in the history
Drastically improves throughput on larger inputs
(3x+ for large URIs or header values).
  • Loading branch information
AaronO authored Apr 18, 2023
1 parent 5dd152e commit d745bd2
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 22 deletions.
31 changes: 15 additions & 16 deletions src/simd/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@ pub enum Scan {
TooShort,
}

/// Placeholder for the AVX2 URI batch scanner on 32-bit x86 builds.
///
/// Compiled only when `target_arch = "x86"`. The argument is ignored and the
/// body unconditionally panics through `unreachable!`, so the function never
/// returns a value.
///
/// NOTE(review): the `x86_64` definition of `parse_uri_batch_32` is `pub`,
/// takes `&mut Bytes` and returns `Scan`, while this stub is private, takes
/// `&[u8]` and returns `usize` — confirm whether the two signatures are meant
/// to match.
///
/// # Safety
///
/// The body performs no unsafe operation; presumably the `unsafe` qualifier
/// only mirrors the `x86_64` variant — TODO confirm.
///
/// # Panics
///
/// Always panics when called.
#[cfg(target_arch = "x86")]
unsafe fn parse_uri_batch_32(_: &[u8]) -> usize {
// Per the panic message, this path is only reached if AVX2 was selected on
// x86, where detection is expected to be disabled.
unreachable!("AVX2 detection should be disabled for x86");
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn parse_uri_batch_32(bytes: &mut Bytes) -> Scan {
while bytes.as_ref().len() >= 32 {
let advance = match_url_char_32_avx(bytes.as_ref());
Expand All @@ -20,9 +26,7 @@ pub unsafe fn parse_uri_batch_32(bytes: &mut Bytes) -> Scan {
Scan::TooShort
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
#[inline(always)]
#[allow(non_snake_case, overflowing_literals)]
unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 32);
Expand Down Expand Up @@ -59,16 +63,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms);

let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256());
let r = 0xffff_ffff_0000_0000 | _mm256_movemask_epi8(v) as u64;
let r = _mm256_movemask_epi8(v) as u32;

_tzcnt_u64(r) as usize
r.trailing_zeros() as usize
}

#[cfg(target_arch = "x86")]
unsafe fn match_url_char_32_avx(_: &[u8]) -> usize {
unsafe fn match_header_value_batch_32(_: &[u8]) -> usize {
unreachable!("AVX2 detection should be disabled for x86");
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn match_header_value_batch_32(bytes: &mut Bytes) -> Scan {
while bytes.as_ref().len() >= 32 {
let advance = match_header_value_char_32_avx(bytes.as_ref());
Expand All @@ -81,9 +87,7 @@ pub unsafe fn match_header_value_batch_32(bytes: &mut Bytes) -> Scan {
Scan::TooShort
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
#[inline(always)]
#[allow(non_snake_case)]
unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 32);
Expand All @@ -109,14 +113,9 @@ unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize {
let del = _mm256_cmpeq_epi8(dat, DEL);
let bit = _mm256_andnot_si256(del, _mm256_or_si256(low, tab));
let rev = _mm256_cmpeq_epi8(bit, _mm256_setzero_si256());
let res = 0xffff_ffff_0000_0000 | _mm256_movemask_epi8(rev) as u64;
let res = _mm256_movemask_epi8(rev) as u32;

_tzcnt_u64(res) as usize
}

#[cfg(target_arch = "x86")]
unsafe fn match_header_value_char_32_avx(_: &[u8]) -> usize {
unreachable!("AVX2 detection should be disabled for x86");
res.trailing_zeros() as usize
}

#[test]
Expand Down
14 changes: 8 additions & 6 deletions src/simd/sse42.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::iter::Bytes;

#[target_feature(enable = "sse4.2")]
pub unsafe fn parse_uri_batch_16(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 16 {
let advance = match_url_char_16_sse(bytes.as_ref());
Expand All @@ -11,7 +12,7 @@ pub unsafe fn parse_uri_batch_16(bytes: &mut Bytes) {
}
}

#[target_feature(enable = "sse4.2")]
#[inline(always)]
#[allow(non_snake_case, overflowing_literals)]
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 16);
Expand Down Expand Up @@ -54,11 +55,12 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms);

let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128());
let r = 0xffff_0000 | _mm_movemask_epi8(v) as u32;
let r = _mm_movemask_epi8(v) as u16;

_tzcnt_u32(r) as usize
r.trailing_zeros() as usize
}

#[target_feature(enable = "sse4.2")]
pub unsafe fn match_header_value_batch_16(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 16 {
let advance = match_header_value_char_16_sse(bytes.as_ref());
Expand All @@ -70,7 +72,7 @@ pub unsafe fn match_header_value_batch_16(bytes: &mut Bytes) {
}
}

#[target_feature(enable = "sse4.2")]
#[inline(always)]
#[allow(non_snake_case)]
unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 16);
Expand All @@ -94,9 +96,9 @@ unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize {
let del = _mm_cmpeq_epi8(dat, DEL);
let bit = _mm_andnot_si128(del, _mm_or_si128(low, tab));
let rev = _mm_cmpeq_epi8(bit, _mm_setzero_si128());
let res = 0xffff_0000 | _mm_movemask_epi8(rev) as u32;
let res = _mm_movemask_epi8(rev) as u16;

_tzcnt_u32(res) as usize
res.trailing_zeros() as usize
}

#[test]
Expand Down

0 comments on commit d745bd2

Please sign in to comment.