From 76900705e8a54e8058a7d8f95da7fb873082c298 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Mon, 5 Jun 2023 08:55:43 +0000 Subject: [PATCH 1/4] Implement all vendor intrinsics used by regex on AVX2 systems This allows it to work with --sysroot llvm --- example/std_example.rs | 47 +++++++++++ src/intrinsics/llvm_x86.rs | 158 ++++++++++++++++++++++++++++++++++++- src/value_and_place.rs | 21 +++++ 3 files changed, 225 insertions(+), 1 deletion(-) diff --git a/example/std_example.rs b/example/std_example.rs index ab4045d11a663..811dbb267cdd7 100644 --- a/example/std_example.rs +++ b/example/std_example.rs @@ -198,6 +198,9 @@ unsafe fn test_simd() { test_mm_extract_epi8(); test_mm_insert_epi16(); + test_mm256_shuffle_epi8(); + test_mm256_permute2x128_si256(); + #[rustfmt::skip] let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))); assert_eq!(mask1, 1); @@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { } } +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx")] +pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b)) +} + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse2")] unsafe fn test_mm_cvtsi128_si64() { @@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() { assert_eq_m128i(r, e); } +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn test_mm256_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + ); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + 5, 0, 5, 4, 9, 13, 7, 4, + 13, 6, 6, 11, 5, 2, 9, 1, + 21, 0, 21, 20, 25, 29, 23, 20, + 29, 22, 22, 27, 21, 18, 25, 17, + ); + let r = _mm256_shuffle_epi8(a, b); + assert_eq_m256i(r, expected); +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn test_mm256_permute2x128_si256() { + let a = _mm256_setr_epi64x(100, 200, 500, 600); + let b = _mm256_setr_epi64x(300, 400, 700, 800); + let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b); + let e = _mm256_setr_epi64x(700, 800, 500, 600); + assert_eq_m256i(r, e); +} + fn test_checked_mul() { let u: Option = u8::from_str_radix("1000", 10).ok(); assert_eq!(u, None); diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 8b6f8ca672da0..bd80559abec73 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -110,7 +110,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( }; let a = codegen_operand(fx, a); let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) - .expect("llvm.x86.sse2.psrli.d imm8 not const"); + .expect("llvm.x86.sse2.pslli.d imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx.psrli.d" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.psrli.d imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx.pslli.d" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.pslli.d imm8 not const"); simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 .try_to_bits(Size::from_bytes(4)) @@ -120,6 +154,128 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( _ => fx.bcx.ins().iconst(types::I32, 0), }); } + "llvm.x86.avx2.psrli.w" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.psrli.w imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx2.pslli.w" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.pslli.w imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx2.pshuf.b" => { + let (a, b) = match args { + [a, b] => (a, b), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let b = codegen_operand(fx, b); + + // Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332 + let zero = fx.bcx.ins().iconst(types::I8, 0); + for i in 0..16 { + let b_lane = b.value_lane(fx, i).load_scalar(fx); + let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); + let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf); + let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); + let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, a_lane); + ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + } + for i in 16..32 { + let b_lane = b.value_lane(fx, i).load_scalar(fx); + let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); + let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf); + let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16); + let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); + let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, a_lane); + ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + } + } + "llvm.x86.avx2.vperm2i128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + let (a, b, imm8) = match args { + [a, b, imm8] => (a, b, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let b = codegen_operand(fx, b); + let imm8 = codegen_operand(fx, imm8).load_scalar(fx); + + let a_0 = a.value_lane(fx, 0).load_scalar(fx); + let a_1 = a.value_lane(fx, 1).load_scalar(fx); + let a_low = fx.bcx.ins().iconcat(a_0, a_1); + let a_2 = a.value_lane(fx, 2).load_scalar(fx); + let a_3 = a.value_lane(fx, 3).load_scalar(fx); + let a_high = fx.bcx.ins().iconcat(a_2, a_3); + + let b_0 = b.value_lane(fx, 0).load_scalar(fx); + let b_1 = b.value_lane(fx, 1).load_scalar(fx); + let b_low = fx.bcx.ins().iconcat(b_0, b_1); + let b_2 = b.value_lane(fx, 2).load_scalar(fx); + let b_3 = b.value_lane(fx, 3).load_scalar(fx); + let b_high = fx.bcx.ins().iconcat(b_2, b_3); + + fn select4( + fx: &mut FunctionCx<'_, '_, '_>, + a_high: Value, + a_low: Value, + b_high: Value, + b_low: Value, + control: Value, + ) -> Value { + let a_or_b = fx.bcx.ins().band_imm(control, 0b0010); + let high_or_low = fx.bcx.ins().band_imm(control, 0b0001); + let is_zero = fx.bcx.ins().band_imm(control, 0b1000); + + let zero = fx.bcx.ins().iconst(types::I64, 0); + let zero = fx.bcx.ins().iconcat(zero, zero); + + let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low); + let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low); + let res = fx.bcx.ins().select(a_or_b, res_b, res_a); + fx.bcx.ins().select(is_zero, zero, res) + } + + let control0 = imm8; + let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); + let (res_0, res_1) = fx.bcx.ins().isplit(res_low); + + let control1 = fx.bcx.ins().ushr_imm(imm8, 4); + let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); + let (res_2, res_3) = fx.bcx.ins().isplit(res_high); + + ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); + ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); + ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); + ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + } "llvm.x86.sse2.storeu.dq" => { intrinsic_args!(fx, args => (mem_addr, a); intrinsic); let mem_addr = mem_addr.load_scalar(fx); diff --git a/src/value_and_place.rs b/src/value_and_place.rs index b1fda6ff21337..133c989b68643 100644 --- a/src/value_and_place.rs +++ b/src/value_and_place.rs @@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> { } } + /// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index. + pub(crate) fn value_lane_dyn( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_idx: Value, + ) -> CValue<'tcx> { + let layout = self.1; + assert!(layout.ty.is_simd()); + let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + match self.0 { + CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), + CValueInner::ByRef(ptr, None) => { + let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64); + let field_ptr = ptr.offset_value(fx, field_offset); + CValue::by_ref(field_ptr, lane_layout) + } + CValueInner::ByRef(_, Some(_)) => unreachable!(), + } + } + /// If `ty` is signed, `const_val` must already be sign extended. pub(crate) fn const_val( fx: &mut FunctionCx<'_, '_, 'tcx>, From aeac484d18f285e0f647348a36d897d2e0aff38b Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Mon, 5 Jun 2023 15:48:25 +0000 Subject: [PATCH 2/4] Run tests with LLVM sysroot in CI --- .github/workflows/main.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ed562d6ebe41c..5dbdc00004ed4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -93,12 +93,6 @@ jobs: - name: Prepare dependencies run: ./y.rs prepare - - name: Build without unstable features - env: - TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }} - # This is the config rust-lang/rust uses for builds - run: ./y.rs build --no-unstable-features - - name: Build run: ./y.rs build --sysroot none @@ -107,6 +101,15 @@ jobs: TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }} run: ./y.rs test + - name: Install LLVM standard library + run: rustup target add ${{ matrix.env.TARGET_TRIPLE }} + + # This is roughly config rust-lang/rust uses for testing + - name: Test with LLVM sysroot + env: + TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }} + run: ./y.rs test --sysroot llvm --no-unstable-features + # This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too. test_llvm: From 1797ae5174dc1444b2711ac250e1b44bf25ae1ef Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Mon, 5 Jun 2023 16:54:37 +0000 Subject: [PATCH 3/4] Define rust_eh_personality for alloc_example x86_64-pc-windows-gnu requires it to be defined. --- example/alloc_example.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/example/alloc_example.rs b/example/alloc_example.rs index d994e2fbc0ae0..117eed5afd8ab 100644 --- a/example/alloc_example.rs +++ b/example/alloc_example.rs @@ -1,4 +1,4 @@ -#![feature(start, core_intrinsics, alloc_error_handler)] +#![feature(start, core_intrinsics, alloc_error_handler, lang_items)] #![no_std] extern crate alloc; @@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! { core::intrinsics::abort(); } +#[lang = "eh_personality"] +fn eh_personality() -> ! { + loop {} +} + #[start] fn main(_argc: isize, _argv: *const *const u8) -> isize { let world: Box<&str> = Box::new("Hello World!\0"); From 8fbd6f521a1375c7c3d62ed157e3c2dbb98f948e Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Mon, 5 Jun 2023 17:20:59 +0000 Subject: [PATCH 4/4] Skip LLVM sysroot testing for native x86_64-pc-windows-gnu in CI It is way too slow and cross-compiled x86_64-pc-windows-gnu covers at least part of the tests. --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5dbdc00004ed4..abcd1affdee69 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -106,6 +106,9 @@ jobs: # This is roughly config rust-lang/rust uses for testing - name: Test with LLVM sysroot + # Skip native x86_64-pc-windows-gnu. It is way too slow and cross-compiled + # x86_64-pc-windows-gnu covers at least part of the tests. + if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu' env: TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }} run: ./y.rs test --sysroot llvm --no-unstable-features