Skip to content

Commit

Permalink
CL/aarch64: implement the wasm SIMD pseudo-max/min and FP-rounding in…
Browse files Browse the repository at this point in the history
…structions

This patch implements, for aarch64, the following wasm SIMD extensions

  Floating-point rounding instructions
  WebAssembly/simd#232

  Pseudo-Minimum and Pseudo-Maximum instructions
  WebAssembly/simd#122

The changes are straightforward:

* `build.rs`: the relevant tests have been enabled

* `cranelift/codegen/meta/src/shared/instructions.rs`: new CLIF instructions
  `fmin_pseudo` and `fmax_pseudo`.  The wasm rounding instructions do not need
  any new CLIF instructions.

* `cranelift/wasm/src/code_translator.rs`: translation into CLIF; this is
  pretty much the same as any other unary or binary vector instruction (for
  the rounding and the pmin/max respectively)

* `cranelift/codegen/src/isa/aarch64/lower_inst.rs`:
  - `fmin_pseudo` and `fmax_pseudo` are converted into a two-instruction
    sequence, `fcmgt` followed by `bsl`
  - the CLIF rounding instructions are converted to a suitable vector
    `frint{n,z,p,m}` instruction.

* `cranelift/codegen/src/isa/aarch64/inst/mod.rs`: minor extension of `pub
  enum VecMisc2` to handle the rounding operations, with the corresponding
  encoding cases added in `inst/emit.rs`.
  • Loading branch information
julian-seward1 committed Oct 24, 2020
1 parent 2702942 commit bfceb8c
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 37 deletions.
14 changes: 7 additions & 7 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,17 +229,17 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
}

// This is only implemented on aarch64.
("simd", "simd_boolean") => {
// These are only implemented on aarch64.
("simd", "simd_boolean")
| ("simd", "simd_f32x4_pmin_pmax")
| ("simd", "simd_f32x4_rounding")
| ("simd", "simd_f64x2_pmin_pmax")
| ("simd", "simd_f64x2_rounding") => {
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "aarch64";
}

// These tests have simd operators which aren't implemented yet.
("simd", "simd_f32x4_pmin_pmax") => return true,
("simd", "simd_f32x4_rounding") => return true,
("simd", "simd_f64x2_pmin_pmax") => return true,
("simd", "simd_f64x2_rounding") => return true,

// (currently none)
_ => {}
},
_ => panic!("unrecognized strategy"),
Expand Down
32 changes: 32 additions & 0 deletions cranelift/codegen/meta/src/shared/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3577,6 +3577,22 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

ig.push(
Inst::new(
"fmin_pseudo",
r#"
Floating point pseudo-minimum. This behaves differently from ``fmin``.
See https://github.com/WebAssembly/simd/pull/122 for background.
The result is defined as ``(y < x) ? y : x``. Consequently ``x`` is returned whenever the
comparison is false: if either operand is NaN (so a NaN in ``y`` is not propagated), or if
both operands are zeroes, regardless of sign.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);

let a = &Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``");

ig.push(
Expand All @@ -3593,6 +3609,22 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

ig.push(
Inst::new(
"fmax_pseudo",
r#"
Floating point pseudo-maximum. This behaves differently from ``fmax``.
See https://github.com/WebAssembly/simd/pull/122 for background.
The result is defined as ``(x < y) ? y : x``. Consequently ``x`` is returned whenever the
comparison is false: if either operand is NaN (so a NaN in ``y`` is not propagated), or if
both operands are zeroes, regardless of sign.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);

let a = &Operand::new("a", Float).with_doc("``x`` rounded to integral value");

ig.push(
Expand Down
16 changes: 16 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1429,6 +1429,22 @@ impl MachInstEmit for Inst {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b1, 0b11101, enc_size & 0b1)
}
VecMisc2::Frintn => {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b0, 0b11000, enc_size & 0b01)
}
VecMisc2::Frintz => {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b0, 0b11001, enc_size | 0b10)
}
VecMisc2::Frintm => {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b0, 0b11001, enc_size & 0b01)
}
VecMisc2::Frintp => {
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
(0b0, 0b11000, enc_size | 0b10)
}
};
sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
}
Expand Down
88 changes: 88 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3476,6 +3476,94 @@ fn test_aarch64_binemit() {
"ucvtf v10.2d, v19.2d",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintn,
rd: writable_vreg(11),
rn: vreg(18),
size: VectorSize::Size32x4,
},
"4B8A214E",
"frintn v11.4s, v18.4s",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintn,
rd: writable_vreg(12),
rn: vreg(17),
size: VectorSize::Size64x2,
},
"2C8A614E",
"frintn v12.2d, v17.2d",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintz,
rd: writable_vreg(11),
rn: vreg(18),
size: VectorSize::Size32x4,
},
"4B9AA14E",
"frintz v11.4s, v18.4s",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintz,
rd: writable_vreg(12),
rn: vreg(17),
size: VectorSize::Size64x2,
},
"2C9AE14E",
"frintz v12.2d, v17.2d",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintm,
rd: writable_vreg(11),
rn: vreg(18),
size: VectorSize::Size32x4,
},
"4B9A214E",
"frintm v11.4s, v18.4s",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintm,
rd: writable_vreg(12),
rn: vreg(17),
size: VectorSize::Size64x2,
},
"2C9A614E",
"frintm v12.2d, v17.2d",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintp,
rd: writable_vreg(11),
rn: vreg(18),
size: VectorSize::Size32x4,
},
"4B8AA14E",
"frintp v11.4s, v18.4s",
));

insns.push((
Inst::VecMisc {
op: VecMisc2::Frintp,
rd: writable_vreg(12),
rn: vreg(17),
size: VectorSize::Size64x2,
},
"2C8AE14E",
"frintp v12.2d, v17.2d",
));

insns.push((
Inst::VecLanes {
op: VecLanesOp::Uminv,
Expand Down
12 changes: 12 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,14 @@ pub enum VecMisc2 {
Scvtf,
/// Unsigned integer convert to floating-point
Ucvtf,
/// Floating point round to integral, rounding to nearest with ties to even
Frintn,
/// Floating point round to integral, rounding towards zero
Frintz,
/// Floating point round to integral, rounding towards minus infinity
Frintm,
/// Floating point round to integral, rounding towards plus infinity
Frintp,
}

/// A Vector narrowing operation with two registers.
Expand Down Expand Up @@ -3435,6 +3443,10 @@ impl Inst {
VecMisc2::Fcvtzu => ("fcvtzu", size),
VecMisc2::Scvtf => ("scvtf", size),
VecMisc2::Ucvtf => ("ucvtf", size),
VecMisc2::Frintn => ("frintn", size),
VecMisc2::Frintz => ("frintz", size),
VecMisc2::Frintm => ("frintm", size),
VecMisc2::Frintp => ("frintp", size),
};

let rd_size = if is_shll { size.widen() } else { size };
Expand Down
85 changes: 70 additions & 15 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2373,6 +2373,43 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}

Opcode::FminPseudo | Opcode::FmaxPseudo => {
let ty = ctx.input_ty(insn, 0);
if ty == F32X4 || ty == F64X2 {
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
// pmax(a,b) => bitsel(b, a, cmpgt(b, a))
let r_dst = get_output_reg(ctx, outputs[0]);
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
// Since we're going to write the output register `r_dst` anyway, we might as
// well first use it to hold the comparison result. This has the slightly unusual
// effect that we modify the output register in the first instruction (`fcmgt`)
// but read both the inputs again in the second instruction (`bsl`), which means
// that the output register can't be either of the input registers. Regalloc
// should handle this correctly, nevertheless.
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Fcmgt,
rd: r_dst,
rn: if op == Opcode::FminPseudo { r_a } else { r_b },
rm: if op == Opcode::FminPseudo { r_b } else { r_a },
size: if ty == F32X4 {
VectorSize::Size32x4
} else {
VectorSize::Size64x2
},
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Bsl,
rd: r_dst,
rn: r_b,
rm: r_a,
size: VectorSize::Size8x16,
});
} else {
panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
}
}

Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
let ty = ty.unwrap();
let bits = ty_bits(ty);
Expand Down Expand Up @@ -2411,21 +2448,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}

Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
let bits = ty_bits(ctx.output_ty(insn, 0));
let op = match (op, bits) {
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
_ => panic!("Unknown op/bits combination"),
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::FpuRound { op, rd, rn });
let ty = ctx.output_ty(insn, 0);
if !ty.is_vector() {
let bits = ty_bits(ty);
let op = match (op, bits) {
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
_ => panic!("Unknown op/bits combination (scalar)"),
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::FpuRound { op, rd, rn });
} else {
let (op, size) = match (op, ty) {
(Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
(Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
(Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
(Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
(Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
(Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
(Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
(Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
_ => panic!("Unknown op/ty combination (vector){:?}", ty),
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::VecMisc { op, rd, rn, size });
}
}

Opcode::Fma => {
Expand Down
Binary file modified cranelift/codegen/src/preopt.serialized
Binary file not shown.
55 changes: 40 additions & 15 deletions cranelift/wasm/src/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1679,6 +1679,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().fmin(a, b))
}
Operator::F32x4PMax | Operator::F64x2PMax => {
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().fmax_pseudo(a, b))
}
Operator::F32x4PMin | Operator::F64x2PMin => {
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().fmin_pseudo(a, b))
}
Operator::F32x4Sqrt | Operator::F64x2Sqrt => {
let a = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().sqrt(a))
Expand Down Expand Up @@ -1756,19 +1764,24 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.push1(builder.ins().uwiden_high(a))
}

Operator::F32x4Ceil
| Operator::F32x4Floor
| Operator::F32x4Trunc
| Operator::F32x4Nearest
| Operator::F32x4PMin
| Operator::F32x4PMax
| Operator::F64x2Ceil
| Operator::F64x2Floor
| Operator::F64x2Trunc
| Operator::F64x2PMin
| Operator::F64x2PMax
| Operator::F64x2Nearest => {
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
Operator::F32x4Ceil | Operator::F64x2Ceil => {
// This is something of a misuse of `type_of`, because that produces the return type
// of `op`. In this case we want the arg type, but we know it's the same as the
// return type. Same for the 3 cases below.
let arg = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().ceil(arg));
}
Operator::F32x4Floor | Operator::F64x2Floor => {
let arg = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().floor(arg));
}
Operator::F32x4Trunc | Operator::F64x2Trunc => {
let arg = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().trunc(arg));
}
Operator::F32x4Nearest | Operator::F64x2Nearest => {
let arg = pop1_with_bitcast(state, type_of(op), builder);
state.push1(builder.ins().nearest(arg));
}

Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
Expand Down Expand Up @@ -2528,8 +2541,14 @@ fn type_of(operator: &Operator) -> Type {
| Operator::F32x4Div
| Operator::F32x4Min
| Operator::F32x4Max
| Operator::F32x4PMin
| Operator::F32x4PMax
| Operator::I32x4TruncSatF32x4S
| Operator::I32x4TruncSatF32x4U => F32X4,
| Operator::I32x4TruncSatF32x4U
| Operator::F32x4Ceil
| Operator::F32x4Floor
| Operator::F32x4Trunc
| Operator::F32x4Nearest => F32X4,

Operator::F64x2Splat
| Operator::F64x2ExtractLane { .. }
Expand All @@ -2548,7 +2567,13 @@ fn type_of(operator: &Operator) -> Type {
| Operator::F64x2Mul
| Operator::F64x2Div
| Operator::F64x2Min
| Operator::F64x2Max => F64X2,
| Operator::F64x2Max
| Operator::F64x2PMin
| Operator::F64x2PMax
| Operator::F64x2Ceil
| Operator::F64x2Floor
| Operator::F64x2Trunc
| Operator::F64x2Nearest => F64X2,

_ => unimplemented!(
"Currently only SIMD instructions are mapped to their return type; the \
Expand Down

0 comments on commit bfceb8c

Please sign in to comment.