diff --git a/build.rs b/build.rs
index 513ea1278910..49f9f81db716 100644
--- a/build.rs
+++ b/build.rs
@@ -229,17 +229,17 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                 return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
             }
 
-            // This is only implemented on aarch64.
-            ("simd", "simd_boolean") => {
+            // These are only implemented on aarch64.
+            ("simd", "simd_boolean")
+            | ("simd", "simd_f32x4_pmin_pmax")
+            | ("simd", "simd_f32x4_rounding")
+            | ("simd", "simd_f64x2_pmin_pmax")
+            | ("simd", "simd_f64x2_rounding") => {
                 return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "aarch64";
             }
 
             // These tests have simd operators which aren't implemented yet.
-            ("simd", "simd_f32x4_pmin_pmax") => return true,
-            ("simd", "simd_f32x4_rounding") => return true,
-            ("simd", "simd_f64x2_pmin_pmax") => return true,
-            ("simd", "simd_f64x2_rounding") => return true,
-
+            // (currently none)
             _ => {}
         },
         _ => panic!("unrecognized strategy"),
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 053848975a02..9f759731f44b 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3577,6 +3577,22 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "fmin_pseudo",
+            r#"
+        Floating point pseudo-minimum, propagating NaNs. This behaves differently from ``fmin``.
+        See https://github.com/WebAssembly/simd/pull/122 for background.
+
+        The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour
+        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let a = &Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``");
 
     ig.push(
@@ -3593,6 +3609,22 @@
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "fmax_pseudo",
+            r#"
+        Floating point pseudo-maximum, propagating NaNs. This behaves differently from ``fmax``.
+        See https://github.com/WebAssembly/simd/pull/122 for background.
+
+        The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour
+        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let a = &Operand::new("a", Float).with_doc("``x`` rounded to integral value");
 
     ig.push(
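For reference, the semantics stated in the two docstrings above can be written out as a scalar Rust model. This is illustrative only and not part of the patch; the free functions below are made up here, not Cranelift APIs.

// Scalar model of fmin_pseudo / fmax_pseudo, transcribed from the formulas
// in the instruction docstrings above.
fn fmin_pseudo(a: f64, b: f64) -> f64 {
    // fmin_pseudo(a, b) = (b < a) ? b : a
    if b < a { b } else { a }
}

fn fmax_pseudo(a: f64, b: f64) -> f64 {
    // fmax_pseudo(a, b) = (a < b) ? b : a
    if a < b { b } else { a }
}

fn main() {
    // Any comparison involving NaN is false, so a NaN in the first operand is
    // returned unchanged, a NaN in the second operand is discarded, and
    // -0.0 and +0.0 are never distinguished.
    assert_eq!(fmin_pseudo(1.0, 2.0), 1.0);
    assert_eq!(fmax_pseudo(1.0, 2.0), 2.0);
    assert!(fmin_pseudo(f64::NAN, 2.0).is_nan());
    assert_eq!(fmin_pseudo(2.0, f64::NAN), 2.0);
    assert_eq!(fmin_pseudo(0.0, -0.0), 0.0);
}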
+ "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let a = &Operand::new("a", Float).with_doc("``x`` rounded to integral value"); ig.push( diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 05e76ff606e6..7b4df2449a16 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1429,6 +1429,22 @@ impl MachInstEmit for Inst { debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); (0b1, 0b11101, enc_size & 0b1) } + VecMisc2::Frintn => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11000, enc_size & 0b01) + } + VecMisc2::Frintz => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11001, enc_size | 0b10) + } + VecMisc2::Frintm => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11001, enc_size & 0b01) + } + VecMisc2::Frintp => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11000, enc_size | 0b10) + } }; sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 0f6a0b115b76..18d7cbe7a45a 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -3476,6 +3476,94 @@ fn test_aarch64_binemit() { "ucvtf v10.2d, v19.2d", )); + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintn, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B8A214E", + "frintn v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintn, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C8A614E", + "frintn v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintz, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B9AA14E", + "frintz v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintz, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C9AE14E", + "frintz v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintm, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B9A214E", + "frintm v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintm, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C9A614E", + "frintm v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintp, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B8AA14E", + "frintp v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintp, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C8AE14E", + "frintp v12.2d, v17.2d", + )); + insns.push(( Inst::VecLanes { op: VecLanesOp::Uminv, diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 31a15d95802d..c2da46035bcd 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -318,6 +318,14 @@ pub enum VecMisc2 { Scvtf, /// Unsigned integer convert to floating-point Ucvtf, + /// Floating point round to integral, rounding towards nearest + Frintn, + /// Floating point round 
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 31a15d95802d..c2da46035bcd 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -318,6 +318,14 @@ pub enum VecMisc2 {
     Scvtf,
     /// Unsigned integer convert to floating-point
     Ucvtf,
+    /// Floating point round to integral, rounding towards nearest
+    Frintn,
+    /// Floating point round to integral, rounding towards zero
+    Frintz,
+    /// Floating point round to integral, rounding towards minus infinity
+    Frintm,
+    /// Floating point round to integral, rounding towards plus infinity
+    Frintp,
 }
 
 /// A Vector narrowing operation with two registers.
@@ -3435,6 +3443,10 @@ impl Inst {
             VecMisc2::Fcvtzu => ("fcvtzu", size),
             VecMisc2::Scvtf => ("scvtf", size),
             VecMisc2::Ucvtf => ("ucvtf", size),
+            VecMisc2::Frintn => ("frintn", size),
+            VecMisc2::Frintz => ("frintz", size),
+            VecMisc2::Frintm => ("frintm", size),
+            VecMisc2::Frintp => ("frintp", size),
         };
 
         let rd_size = if is_shll { size.widen() } else { size };
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 5477964a444b..b93cb3d497b0 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2373,6 +2373,43 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::FminPseudo | Opcode::FmaxPseudo => {
+            let ty = ctx.input_ty(insn, 0);
+            if ty == F32X4 || ty == F64X2 {
+                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
+                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
+                let r_dst = get_output_reg(ctx, outputs[0]);
+                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                // Since we're going to write the output register `r_dst` anyway, we might as
+                // well first use it to hold the comparison result. This has the slightly unusual
+                // effect that we modify the output register in the first instruction (`fcmgt`)
+                // but read both the inputs again in the second instruction (`bsl`), which means
+                // that the output register can't be either of the input registers. Regalloc
+                // should handle this correctly, nevertheless.
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Fcmgt,
+                    rd: r_dst,
+                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
+                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
+                    size: if ty == F32X4 {
+                        VectorSize::Size32x4
+                    } else {
+                        VectorSize::Size64x2
+                    },
+                });
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Bsl,
+                    rd: r_dst,
+                    rn: r_b,
+                    rm: r_a,
+                    size: VectorSize::Size8x16,
+                });
+            } else {
+                panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
+            }
+        }
+
         Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
             let ty = ty.unwrap();
             let bits = ty_bits(ty);
@@ -2411,21 +2448,39 @@
         }
 
         Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
-            let bits = ty_bits(ctx.output_ty(insn, 0));
-            let op = match (op, bits) {
-                (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
-                (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
-                (Opcode::Floor, 32) => FpuRoundMode::Minus32,
-                (Opcode::Floor, 64) => FpuRoundMode::Minus64,
-                (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
-                (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
-                (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
-                (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
-                _ => panic!("Unknown op/bits combination"),
-            };
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::FpuRound { op, rd, rn });
+            let ty = ctx.output_ty(insn, 0);
+            if !ty.is_vector() {
+                let bits = ty_bits(ty);
+                let op = match (op, bits) {
+                    (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
+                    (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
+                    (Opcode::Floor, 32) => FpuRoundMode::Minus32,
+                    (Opcode::Floor, 64) => FpuRoundMode::Minus64,
+                    (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
+                    (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
+                    (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
+                    (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
+                    _ => panic!("Unknown op/bits combination (scalar)"),
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::FpuRound { op, rd, rn });
+            } else {
+                let (op, size) = match (op, ty) {
+                    (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
+                    (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
+                    (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
+                    (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
+                    (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
+                    (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
+                    (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
+                    (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
+                    _ => panic!("Unknown op/ty combination (vector){:?}", ty),
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::VecMisc { op, rd, rn, size });
+            }
         }
 
         Opcode::Fma => {
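The comment in the lowering above describes the pseudo-min as `bitsel(b, a, cmpgt(a, b))`. Below is a lane-wise Rust model of that FCMGT + BSL pair, illustrative only; `fcmgt` and `bsl` here are hypothetical helpers written for this sketch, not backend APIs.

// Lane-wise model of the two-instruction sequence emitted for FminPseudo
// on f32x4.
fn fcmgt(a: [f32; 4], b: [f32; 4]) -> [u32; 4] {
    // FCMGT: lane is all-ones where a > b, all-zeros otherwise (false for NaN).
    let mut m = [0u32; 4];
    for i in 0..4 {
        m[i] = if a[i] > b[i] { u32::MAX } else { 0 };
    }
    m
}

fn bsl(mask: [u32; 4], n: [f32; 4], m: [f32; 4]) -> [f32; 4] {
    // BSL with the mask in the destination: take `n` bits where the mask is
    // set and `m` bits where it is clear.
    let mut r = [0f32; 4];
    for i in 0..4 {
        let bits = (mask[i] & n[i].to_bits()) | (!mask[i] & m[i].to_bits());
        r[i] = f32::from_bits(bits);
    }
    r
}

fn main() {
    let a = [1.0f32, 4.0, f32::NAN, -0.0];
    let b = [2.0f32, 3.0, 1.0, 0.0];
    // pmin(a, b) = bitsel(b, a, cmpgt(a, b)): take b where a > b, else a.
    let pmin = bsl(fcmgt(a, b), b, a);
    assert_eq!(pmin[0], 1.0);                           // a[0] <= b[0]: keep a
    assert_eq!(pmin[1], 3.0);                           // a[1] >  b[1]: take b
    assert!(pmin[2].is_nan());                          // NaN compares false: keep a
    assert_eq!(pmin[3].to_bits(), (-0.0f32).to_bits()); // keeps a's -0.0
}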
diff --git a/cranelift/codegen/src/preopt.serialized b/cranelift/codegen/src/preopt.serialized
index 03a7b4419380..cd31c13d18fe 100644
Binary files a/cranelift/codegen/src/preopt.serialized and b/cranelift/codegen/src/preopt.serialized differ
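For the vector rounding lowering above (Frintp, Frintm, Frintz, Frintn), the per-lane results the Wasm operators expect can be sketched with the standard floating-point methods; note that `nearest` rounds ties to even, which is what FRINTN implements. This is illustrative only, not backend code; it assumes `f32::round_ties_even`, stable since Rust 1.77, and ignores the sign of zero results.

// Per-lane reference for the four rounding modes mapped in the lowering:
// Frintp = ceil, Frintm = floor, Frintz = trunc, Frintn = nearest (ties to even).
fn main() {
    let v = [-1.5f32, -0.5, 0.5, 2.5];
    let ceil: Vec<f32> = v.iter().map(|x| x.ceil()).collect();
    let floor: Vec<f32> = v.iter().map(|x| x.floor()).collect();
    let trunc: Vec<f32> = v.iter().map(|x| x.trunc()).collect();
    let nearest: Vec<f32> = v.iter().map(|x| x.round_ties_even()).collect();
    assert_eq!(ceil, vec![-1.0, 0.0, 1.0, 3.0]);
    assert_eq!(floor, vec![-2.0, -1.0, 0.0, 2.0]);
    assert_eq!(trunc, vec![-1.0, 0.0, 0.0, 2.0]);
    assert_eq!(nearest, vec![-2.0, 0.0, 0.0, 2.0]);
}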
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index f040d2939340..0b1dde61d377 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1679,6 +1679,14 @@ pub fn translate_operator(
             let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().fmin(a, b))
         }
+        Operator::F32x4PMax | Operator::F64x2PMax => {
+            let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().fmax_pseudo(a, b))
+        }
+        Operator::F32x4PMin | Operator::F64x2PMin => {
+            let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().fmin_pseudo(a, b))
+        }
         Operator::F32x4Sqrt | Operator::F64x2Sqrt => {
             let a = pop1_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().sqrt(a))
@@ -1756,19 +1764,24 @@
             state.push1(builder.ins().uwiden_high(a))
         }
 
-        Operator::F32x4Ceil
-        | Operator::F32x4Floor
-        | Operator::F32x4Trunc
-        | Operator::F32x4Nearest
-        | Operator::F32x4PMin
-        | Operator::F32x4PMax
-        | Operator::F64x2Ceil
-        | Operator::F64x2Floor
-        | Operator::F64x2Trunc
-        | Operator::F64x2PMin
-        | Operator::F64x2PMax
-        | Operator::F64x2Nearest => {
-            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
+        Operator::F32x4Ceil | Operator::F64x2Ceil => {
+            // This is something of a misuse of `type_of`, because that produces the return type
+            // of `op`. In this case we want the arg type, but we know it's the same as the
+            // return type. Same for the 3 cases below.
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().ceil(arg));
+        }
+        Operator::F32x4Floor | Operator::F64x2Floor => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().floor(arg));
+        }
+        Operator::F32x4Trunc | Operator::F64x2Trunc => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().trunc(arg));
+        }
+        Operator::F32x4Nearest | Operator::F64x2Nearest => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().nearest(arg));
         }
 
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
@@ -2528,8 +2541,14 @@
         | Operator::F32x4Div
         | Operator::F32x4Min
         | Operator::F32x4Max
+        | Operator::F32x4PMin
+        | Operator::F32x4PMax
         | Operator::I32x4TruncSatF32x4S
-        | Operator::I32x4TruncSatF32x4U => F32X4,
+        | Operator::I32x4TruncSatF32x4U
+        | Operator::F32x4Ceil
+        | Operator::F32x4Floor
+        | Operator::F32x4Trunc
+        | Operator::F32x4Nearest => F32X4,
 
         Operator::F64x2Splat
         | Operator::F64x2ExtractLane { .. }
@@ -2548,7 +2567,13 @@
         | Operator::F64x2Mul
         | Operator::F64x2Div
         | Operator::F64x2Min
-        | Operator::F64x2Max => F64X2,
+        | Operator::F64x2Max
+        | Operator::F64x2PMin
+        | Operator::F64x2PMax
+        | Operator::F64x2Ceil
+        | Operator::F64x2Floor
+        | Operator::F64x2Trunc
+        | Operator::F64x2Nearest => F64X2,
 
         _ => unimplemented!(
             "Currently only SIMD instructions are mapped to their return type; the \