Skip to content

Commit

Permalink
Merge pull request rust-lang#1378 from bjorn3/more_vendor_intrinsics
Browse files Browse the repository at this point in the history
Implement all vendor intrinsics used by regex on AVX2 systems
  • Loading branch information
bjorn3 authored Jun 5, 2023
2 parents e369cce + 8fbd6f5 commit 204c64b
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 8 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,6 @@ jobs:
- name: Prepare dependencies
run: ./y.rs prepare

- name: Build without unstable features
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
# This is the config rust-lang/rust uses for builds
run: ./y.rs build --no-unstable-features

- name: Build
run: ./y.rs build --sysroot none

Expand All @@ -107,6 +101,18 @@ jobs:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test

- name: Install LLVM standard library
run: rustup target add ${{ matrix.env.TARGET_TRIPLE }}

# This is roughly config rust-lang/rust uses for testing
- name: Test with LLVM sysroot
# Skip native x86_64-pc-windows-gnu. It is way too slow and cross-compiled
# x86_64-pc-windows-gnu covers at least part of the tests.
if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu'
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test --sysroot llvm --no-unstable-features


# This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too.
test_llvm:
Expand Down
7 changes: 6 additions & 1 deletion example/alloc_example.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#![feature(start, core_intrinsics, alloc_error_handler)]
#![feature(start, core_intrinsics, alloc_error_handler, lang_items)]
#![no_std]

extern crate alloc;
Expand Down Expand Up @@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
core::intrinsics::abort();
}

#[lang = "eh_personality"]
fn eh_personality() -> ! {
loop {}
}

#[start]
fn main(_argc: isize, _argv: *const *const u8) -> isize {
let world: Box<&str> = Box::new("Hello World!\0");
Expand Down
47 changes: 47 additions & 0 deletions example/std_example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ unsafe fn test_simd() {
test_mm_extract_epi8();
test_mm_insert_epi16();

test_mm256_shuffle_epi8();
test_mm256_permute2x128_si256();

#[rustfmt::skip]
let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
assert_eq!(mask1, 1);
Expand Down Expand Up @@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
}
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsi128_si64() {
Expand Down Expand Up @@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
assert_eq_m128i(r, e);
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_shuffle_epi8() {
#[rustfmt::skip]
let a = _mm256_setr_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
#[rustfmt::skip]
let b = _mm256_setr_epi8(
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
5, 0, 5, 4, 9, 13, 7, 4,
13, 6, 6, 11, 5, 2, 9, 1,
21, 0, 21, 20, 25, 29, 23, 20,
29, 22, 22, 27, 21, 18, 25, 17,
);
let r = _mm256_shuffle_epi8(a, b);
assert_eq_m256i(r, expected);
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_permute2x128_si256() {
let a = _mm256_setr_epi64x(100, 200, 500, 600);
let b = _mm256_setr_epi64x(300, 400, 700, 800);
let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
let e = _mm256_setr_epi64x(700, 800, 500, 600);
assert_eq_m256i(r, e);
}

fn test_checked_mul() {
let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
assert_eq!(u, None);
Expand Down
158 changes: 157 additions & 1 deletion src/intrinsics/llvm_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrli.d imm8 not const");
.expect("llvm.x86.sse2.pslli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.psrli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
Expand All @@ -120,6 +154,128 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.psrli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.w imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.w imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pshuf.b" => {
let (a, b) = match args {
[a, b] => (a, b),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);

// Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
let zero = fx.bcx.ins().iconst(types::I8, 0);
for i in 0..16 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
}
"llvm.x86.avx2.vperm2i128" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
let (a, b, imm8) = match args {
[a, b, imm8] => (a, b, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

let a_0 = a.value_lane(fx, 0).load_scalar(fx);
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
let a_high = fx.bcx.ins().iconcat(a_2, a_3);

let b_0 = b.value_lane(fx, 0).load_scalar(fx);
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
let b_high = fx.bcx.ins().iconcat(b_2, b_3);

fn select4(
fx: &mut FunctionCx<'_, '_, '_>,
a_high: Value,
a_low: Value,
b_high: Value,
b_low: Value,
control: Value,
) -> Value {
let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
let is_zero = fx.bcx.ins().band_imm(control, 0b1000);

let zero = fx.bcx.ins().iconst(types::I64, 0);
let zero = fx.bcx.ins().iconcat(zero, zero);

let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
fx.bcx.ins().select(is_zero, zero, res)
}

let control0 = imm8;
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
}
"llvm.x86.sse2.storeu.dq" => {
intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
let mem_addr = mem_addr.load_scalar(fx);
Expand Down
21 changes: 21 additions & 0 deletions src/value_and_place.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> {
}
}

/// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
pub(crate) fn value_lane_dyn(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_idx: Value,
) -> CValue<'tcx> {
let layout = self.1;
assert!(layout.ty.is_simd());
let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
match self.0 {
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
let field_ptr = ptr.offset_value(fx, field_offset);
CValue::by_ref(field_ptr, lane_layout)
}
CValueInner::ByRef(_, Some(_)) => unreachable!(),
}
}

/// If `ty` is signed, `const_val` must already be sign extended.
pub(crate) fn const_val(
fx: &mut FunctionCx<'_, '_, 'tcx>,
Expand Down

0 comments on commit 204c64b

Please sign in to comment.