From 008db067f12ff10f747f1e99f176b827ce09ce52 Mon Sep 17 00:00:00 2001 From: "Heinz N. Gies" Date: Fri, 20 Oct 2023 14:32:22 +0200 Subject: [PATCH] feature flag improvements Signed-off-by: Heinz N. Gies --- README.md | 8 ++++ examples/perf.rs | 24 ++++++++--- src/lib.rs | 110 ++++++++++++++++++++++++++++++----------------- 3 files changed, 97 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 65fcf7fb..121a3eec 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,12 @@ To be able to take advantage of `simd-json` your system needs to be SIMD capable `simd-json` supports AVX2, SSE4.2 and NEON and simd128 (wasm) natively, it also includes a unoptimized fallback implementation using native rust for other platforms, however this is a last resport measure and nothing we'd recommend relying on. +### Performance characteristics + +- Compiling for the native CPU results in the best performance. +- CPU detection for AVX2 and SSE4.2 is the second fastest (on x86_* only). +- Portable `std::simd` is the next fastest implementation when compiled with a native CPU target. +- `std::simd` or the Rust native implementation is the least performant. ### allocator @@ -36,6 +42,8 @@ For best performance we highly suggest using [mimalloc](https://crates.io/crates This feature allowa selecting the optimal algorithn based on availalbe features during runeimte, it has no effect on non x86 or x86_64 platforms. When neither `AVX2` nor `SSE4.2` is spported it will fallback to a native rust implementaiton. +Note that an application compiled with `runtime-detection` will not run as fast as an application compiled for a specific CPU. The reason is that Rust can't optimize as far for the instruction set when it targets the generic instruction set; non-SIMD parts of the code won't be optimized for the given instruction set either. 
+ ### `portable` **Currently disabled** diff --git a/examples/perf.rs b/examples/perf.rs index 2aae1158..b5786c6a 100644 --- a/examples/perf.rs +++ b/examples/perf.rs @@ -8,14 +8,24 @@ mod int { use perfcnt::linux::{HardwareEventType, PerfCounterBuilderLinux}; use perfcnt::{AbstractPerfCounter, PerfCounter}; use serde::{Deserialize, Serialize}; + use simd_json::{Deserializer, Implementation}; use std::io::BufReader; #[derive(Default, Serialize, Deserialize)] struct Stats { + algo: String, best: Stat, total: Stat, iters: u64, } + impl Stats { + fn new(algo: Implementation) -> Self { + Stats { + algo: algo.to_string(), + ..Default::default() + } + } + } #[derive(Default, Serialize, Deserialize)] struct Stat { @@ -96,7 +106,7 @@ mod int { let branch_instructions = self.total.branch_instructions / self.iters; println!( - "{:20} {:10} {:10} {:10} {:10} {:10} {:10.3} {:10.3}", + "{:20} {:10} {:10} {:10} {:10} {:10} {:10.3} {:10.3} {:21}", name, cycles, instructions, @@ -104,7 +114,8 @@ mod int { cache_misses, cache_references, ((self.best.cycles as f64) / bytes as f64), - ((cycles as f64) / bytes as f64) + ((cycles as f64) / bytes as f64), + self.algo ); } pub fn print_diff(&self, baseline: &Stats, name: &str, bytes: usize) { @@ -135,7 +146,7 @@ mod int { } println!( - "{:20} {:>10} {:>10} {:>10} {:>10} {:>10} {:10} {:10}", + "{:20} {:>10} {:>10} {:>10} {:>10} {:>10} {:10} {:10} {:21}", format!("{}(+/-)", name), d((1.0 - cycles_b as f64 / cycles as f64) * 100.0), d((1.0 - instructions_b as f64 / instructions as f64) * 100.0), @@ -144,6 +155,7 @@ mod int { d((1.0 - cache_references_b as f64 / cache_references as f64) * 100.0), d((1.0 - best_cycles_per_byte_b as f64 / best_cycles_per_byte as f64) * 100.0), d((1.0 - cycles_per_byte_b as f64 / cycles_per_byte as f64) * 100.0), + baseline.algo ); } } @@ -166,7 +178,7 @@ mod int { for mut bytes in &mut data_entries[..WARMUP as usize] { simd_json::to_borrowed_value(&mut bytes).unwrap(); } - let mut stats = Stats::default(); + 
let mut stats = Stats::new(Deserializer::algorithm()); for mut bytes in &mut data_entries[WARMUP as usize..] { // Set up counters let pc = stats.start(); @@ -219,8 +231,8 @@ fn main() { let matches = opts.parse(&args[1..]).unwrap(); println!( - "{:^20} {:^10} {:^21} {:^21} {:^21}", - " ", "", "Instructions", "Cache.", "Cycle/byte" + "{:^20} {:^10} {:^21} {:^21} {:^21} {:21}", + " ", "", "Instructions", "Cache.", "Cycle/byte", "Algorithm" ); println!( "{:^20} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10}", diff --git a/src/lib.rs b/src/lib.rs index 670abb52..0ae519fe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -457,6 +457,7 @@ impl<'de> Deserializer<'de> { } #[cfg(not(any( feature = "runtime-detection", + feature = "portable", target_feature = "avx2", target_feature = "sse4.2", target_feature = "simd128", @@ -465,14 +466,20 @@ impl<'de> Deserializer<'de> { /// returns the algorithm / architecture used by the deserializer #[must_use] pub fn algorithm() -> Implementation { - #[cfg(feature = "portable")] - let r = Implementation::StdSimd; - #[cfg(not(feature = "portable"))] - let r = Implementation::Native; - r + Implementation::Native + } + #[cfg(all(feature = "portable", not(feature = "runtime-detection")))] + /// returns the algorithm / architecture used by the deserializer + #[must_use] + pub fn algorithm() -> Implementation { + Implementation::StdSimd } - #[cfg(all(target_feature = "avx2", not(feature = "runtime-detection")))] + #[cfg(all( + target_feature = "avx2", + not(feature = "portable"), + not(feature = "runtime-detection"), + ))] /// returns the algorithm / architecture used by the deserializer #[must_use] pub fn algorithm() -> Implementation { @@ -481,8 +488,9 @@ impl<'de> Deserializer<'de> { #[cfg(all( target_feature = "sse4.2", + not(target_feature = "avx2"), not(feature = "runtime-detection"), - not(target_feature = "avx2") + not(feature = "portable"), ))] /// returns the algorithm / architecture used by the deserializer #[must_use] @@ -490,13 
+498,14 @@ impl<'de> Deserializer<'de> { Implementation::SSE42 } - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", not(feature = "portable")))] /// returns the algorithm / architecture used by the deserializer #[must_use] pub fn algorithm() -> Implementation { Implementation::NEON } - #[cfg(target_feature = "simd128")] + + #[cfg(all(target_feature = "simd128", not(feature = "portable")))] /// returns the algorithm / architecture used by the deserializer #[must_use] pub fn algorithm() -> Implementation { @@ -560,6 +569,7 @@ impl<'de> Deserializer<'de> { #[inline] #[cfg(not(any( feature = "runtime-detection", + feature = "portable", target_feature = "avx2", target_feature = "sse4.2", target_feature = "simd128", @@ -575,16 +585,29 @@ impl<'de> Deserializer<'de> { 'de: 'invoke, { let input: SillyWrapper<'de> = SillyWrapper::from(input); - - #[cfg(feature = "portable")] - let r = impls::portable::parse_str(input, data, buffer, idx); - #[cfg(not(feature = "portable"))] - let r = impls::native::parse_str(input, data, buffer, idx); - r + impls::native::parse_str(input, data, buffer, idx) + } + #[inline] + #[cfg(all(feature = "portable", not(feature = "runtime-detection")))] + pub(crate) unsafe fn parse_str_<'invoke>( + input: *mut u8, + data: &'invoke [u8], + buffer: &'invoke mut [u8], + idx: usize, + ) -> Result<&'de str> + where + 'de: 'invoke, + { + let input: SillyWrapper<'de> = SillyWrapper::from(input); + impls::portable::parse_str(input, data, buffer, idx) } #[inline] - #[cfg(all(target_feature = "avx2", not(feature = "runtime-detection")))] + #[cfg(all( + target_feature = "avx2", + not(feature = "portable"), + not(feature = "runtime-detection"), + ))] pub(crate) unsafe fn parse_str_<'invoke>( input: *mut u8, data: &'invoke [u8], @@ -598,8 +621,9 @@ impl<'de> Deserializer<'de> { #[inline] #[cfg(all( target_feature = "sse4.2", + not(target_feature = "avx2"), not(feature = "runtime-detection"), - not(target_feature = "avx2") + not(feature = 
"portable"), ))] pub(crate) unsafe fn parse_str_<'invoke>( input: *mut u8, @@ -612,7 +636,7 @@ impl<'de> Deserializer<'de> { } #[inline] - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", not(feature = "portable")))] pub(crate) unsafe fn parse_str_<'invoke>( input: *mut u8, data: &'invoke [u8], @@ -622,7 +646,7 @@ impl<'de> Deserializer<'de> { impls::neon::parse_str(input, data, buffer, idx) } #[inline] - #[cfg(target_feature = "simd128")] + #[cfg(all(target_feature = "simd128", not(feature = "portable")))] pub(crate) unsafe fn parse_str_<'invoke>( input: *mut u8, data: &'invoke [u8], @@ -678,50 +702,58 @@ impl<'de> Deserializer<'de> { mem::transmute::(fun)(input, structural_indexes) } - #[inline] #[cfg(not(any( feature = "runtime-detection", + feature = "portable", target_feature = "avx2", target_feature = "sse4.2", target_feature = "simd128", target_arch = "aarch64", )))] + #[inline] pub(crate) unsafe fn find_structural_bits( input: &[u8], structural_indexes: &mut Vec, ) -> std::result::Result<(), ErrorType> { - #[cfg(not(feature = "portable"))] - let r = { - // This is a nasty hack, we don't have a chunked implementation for native rust - // so we validate UTF8 ahead of time - match core::str::from_utf8(input) { - Ok(_) => (), - Err(_) => return Err(ErrorType::InvalidUtf8), - }; - #[cfg(not(feature = "portable"))] - Self::_find_structural_bits::(input, structural_indexes) + // This is a nasty hack, we don't have a chunked implementation for native rust + // so we validate UTF8 ahead of time + match core::str::from_utf8(input) { + Ok(_) => (), + Err(_) => return Err(ErrorType::InvalidUtf8), }; - #[cfg(feature = "portable")] - let r = - Self::_find_structural_bits::(input, structural_indexes); - r + #[cfg(not(feature = "portable"))] + Self::_find_structural_bits::(input, structural_indexes) } + #[cfg(all(feature = "portable", not(feature = "runtime-detection")))] #[inline] - #[cfg(all(target_feature = "avx2", not(feature = 
"runtime-detection")))] pub(crate) unsafe fn find_structural_bits( input: &[u8], structural_indexes: &mut Vec, ) -> std::result::Result<(), ErrorType> { - Self::_find_structural_bits::(input, structural_indexes) + Self::_find_structural_bits::(input, structural_indexes) } + #[cfg(all( + target_feature = "avx2", + not(feature = "portable"), + not(feature = "runtime-detection"), + ))] #[inline] + pub(crate) unsafe fn find_structural_bits( + input: &[u8], + structural_indexes: &mut Vec, + ) -> std::result::Result<(), ErrorType> { + Self::_find_structural_bits::(input, structural_indexes) + } + #[cfg(all( target_feature = "sse4.2", + not(target_feature = "avx2"), not(feature = "runtime-detection"), - not(target_feature = "avx2") + not(feature = "portable"), ))] + #[inline] pub(crate) unsafe fn find_structural_bits( input: &[u8], structural_indexes: &mut Vec, @@ -729,8 +761,8 @@ impl<'de> Deserializer<'de> { Self::_find_structural_bits::(input, structural_indexes) } + #[cfg(all(target_arch = "aarch64", not(feature = "portable")))] #[inline] - #[cfg(target_arch = "aarch64")] pub(crate) unsafe fn find_structural_bits( input: &[u8], structural_indexes: &mut Vec, @@ -738,8 +770,8 @@ impl<'de> Deserializer<'de> { Self::_find_structural_bits::(input, structural_indexes) } + #[cfg(all(target_feature = "simd128", not(feature = "portable")))] #[inline] - #[cfg(target_feature = "simd128")] pub(crate) unsafe fn find_structural_bits( input: &[u8], structural_indexes: &mut Vec,