Skip to content

Commit e20f0cf

Browse files
CL/aarch64: implement the wasm SIMD pseudo-max/min and FP-rounding instructions
This patch implements, for aarch64, the following wasm SIMD extensions: Floating-point rounding instructions (WebAssembly/simd#232) and Pseudo-Minimum and Pseudo-Maximum instructions (WebAssembly/simd#122). The changes are straightforward: * `build.rs`: the relevant tests have been enabled. * `cranelift/codegen/meta/src/shared/instructions.rs`: new CLIF instructions `fmin_pseudo` and `fmax_pseudo`. The wasm rounding instructions do not need any new CLIF instructions. * `cranelift/wasm/src/code_translator.rs`: translation into CLIF; this is pretty much the same as any other unary or binary vector instruction (for the rounding and the pmin/max respectively). * `cranelift/codegen/src/isa/aarch64/lower_inst.rs`: - `fmin_pseudo` and `fmax_pseudo` are converted into a two-instruction sequence, `fcmgt` followed by `bsl`; - the CLIF rounding instructions are converted to a suitable vector `frint{n,z,p,m}` instruction. * `cranelift/codegen/src/isa/aarch64/inst/mod.rs`: minor extension of `pub enum VecMisc2` to handle the rounding operations, along with the corresponding `emit` cases.
1 parent 2702942 commit e20f0cf

File tree

8 files changed

+265
-37
lines changed

8 files changed

+265
-37
lines changed

build.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -229,17 +229,17 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
229229
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
230230
}
231231

232-
// This is only implemented on aarch64.
233-
("simd", "simd_boolean") => {
232+
// These are only implemented on aarch64.
233+
("simd", "simd_boolean")
234+
| ("simd", "simd_f32x4_pmin_pmax")
235+
| ("simd", "simd_f32x4_rounding")
236+
| ("simd", "simd_f64x2_pmin_pmax")
237+
| ("simd", "simd_f64x2_rounding") => {
234238
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "aarch64";
235239
}
236240

237241
// These tests have simd operators which aren't implemented yet.
238-
("simd", "simd_f32x4_pmin_pmax") => return true,
239-
("simd", "simd_f32x4_rounding") => return true,
240-
("simd", "simd_f64x2_pmin_pmax") => return true,
241-
("simd", "simd_f64x2_rounding") => return true,
242-
242+
// (currently none)
243243
_ => {}
244244
},
245245
_ => panic!("unrecognized strategy"),

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3577,6 +3577,22 @@ pub(crate) fn define(
35773577
.operands_out(vec![a]),
35783578
);
35793579

3580+
ig.push(
3581+
Inst::new(
3582+
"fmin_pseudo",
3583+
r#"
3584+
Floating point pseudo-minimum, propagating NaNs. This behaves differently from ``fmin``.
3585+
See https://github.com/WebAssembly/simd/pull/122 for background.
3586+
3587+
The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour
3588+
for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
3589+
"#,
3590+
&formats.binary,
3591+
)
3592+
.operands_in(vec![x, y])
3593+
.operands_out(vec![a]),
3594+
);
3595+
35803596
let a = &Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``");
35813597

35823598
ig.push(
@@ -3593,6 +3609,22 @@ pub(crate) fn define(
35933609
.operands_out(vec![a]),
35943610
);
35953611

3612+
ig.push(
3613+
Inst::new(
3614+
"fmax_pseudo",
3615+
r#"
3616+
Floating point pseudo-maximum, propagating NaNs. This behaves differently from ``fmax``.
3617+
See https://github.com/WebAssembly/simd/pull/122 for background.
3618+
3619+
The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour
3620+
for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
3621+
"#,
3622+
&formats.binary,
3623+
)
3624+
.operands_in(vec![x, y])
3625+
.operands_out(vec![a]),
3626+
);
3627+
35963628
let a = &Operand::new("a", Float).with_doc("``x`` rounded to integral value");
35973629

35983630
ig.push(

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,22 @@ impl MachInstEmit for Inst {
14291429
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
14301430
(0b1, 0b11101, enc_size & 0b1)
14311431
}
1432+
VecMisc2::Frintn => {
1433+
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
1434+
(0b0, 0b11000, enc_size & 0b01)
1435+
}
1436+
VecMisc2::Frintz => {
1437+
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
1438+
(0b0, 0b11001, enc_size | 0b10)
1439+
}
1440+
VecMisc2::Frintm => {
1441+
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
1442+
(0b0, 0b11001, enc_size & 0b01)
1443+
}
1444+
VecMisc2::Frintp => {
1445+
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
1446+
(0b0, 0b11000, enc_size | 0b10)
1447+
}
14321448
};
14331449
sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
14341450
}

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3476,6 +3476,94 @@ fn test_aarch64_binemit() {
34763476
"ucvtf v10.2d, v19.2d",
34773477
));
34783478

3479+
insns.push((
3480+
Inst::VecMisc {
3481+
op: VecMisc2::Frintn,
3482+
rd: writable_vreg(11),
3483+
rn: vreg(18),
3484+
size: VectorSize::Size32x4,
3485+
},
3486+
"4B8A214E",
3487+
"frintn v11.4s, v18.4s",
3488+
));
3489+
3490+
insns.push((
3491+
Inst::VecMisc {
3492+
op: VecMisc2::Frintn,
3493+
rd: writable_vreg(12),
3494+
rn: vreg(17),
3495+
size: VectorSize::Size64x2,
3496+
},
3497+
"2C8A614E",
3498+
"frintn v12.2d, v17.2d",
3499+
));
3500+
3501+
insns.push((
3502+
Inst::VecMisc {
3503+
op: VecMisc2::Frintz,
3504+
rd: writable_vreg(11),
3505+
rn: vreg(18),
3506+
size: VectorSize::Size32x4,
3507+
},
3508+
"4B9AA14E",
3509+
"frintz v11.4s, v18.4s",
3510+
));
3511+
3512+
insns.push((
3513+
Inst::VecMisc {
3514+
op: VecMisc2::Frintz,
3515+
rd: writable_vreg(12),
3516+
rn: vreg(17),
3517+
size: VectorSize::Size64x2,
3518+
},
3519+
"2C9AE14E",
3520+
"frintz v12.2d, v17.2d",
3521+
));
3522+
3523+
insns.push((
3524+
Inst::VecMisc {
3525+
op: VecMisc2::Frintm,
3526+
rd: writable_vreg(11),
3527+
rn: vreg(18),
3528+
size: VectorSize::Size32x4,
3529+
},
3530+
"4B9A214E",
3531+
"frintm v11.4s, v18.4s",
3532+
));
3533+
3534+
insns.push((
3535+
Inst::VecMisc {
3536+
op: VecMisc2::Frintm,
3537+
rd: writable_vreg(12),
3538+
rn: vreg(17),
3539+
size: VectorSize::Size64x2,
3540+
},
3541+
"2C9A614E",
3542+
"frintm v12.2d, v17.2d",
3543+
));
3544+
3545+
insns.push((
3546+
Inst::VecMisc {
3547+
op: VecMisc2::Frintp,
3548+
rd: writable_vreg(11),
3549+
rn: vreg(18),
3550+
size: VectorSize::Size32x4,
3551+
},
3552+
"4B8AA14E",
3553+
"frintp v11.4s, v18.4s",
3554+
));
3555+
3556+
insns.push((
3557+
Inst::VecMisc {
3558+
op: VecMisc2::Frintp,
3559+
rd: writable_vreg(12),
3560+
rn: vreg(17),
3561+
size: VectorSize::Size64x2,
3562+
},
3563+
"2C8AE14E",
3564+
"frintp v12.2d, v17.2d",
3565+
));
3566+
34793567
insns.push((
34803568
Inst::VecLanes {
34813569
op: VecLanesOp::Uminv,

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,14 @@ pub enum VecMisc2 {
318318
Scvtf,
319319
/// Unsigned integer convert to floating-point
320320
Ucvtf,
321+
/// Floating point round to integral, rounding towards nearest
322+
Frintn,
323+
/// Floating point round to integral, rounding towards zero
324+
Frintz,
325+
/// Floating point round to integral, rounding towards minus infinity
326+
Frintm,
327+
/// Floating point round to integral, rounding towards plus infinity
328+
Frintp,
321329
}
322330

323331
/// A Vector narrowing operation with two registers.
@@ -3435,6 +3443,10 @@ impl Inst {
34353443
VecMisc2::Fcvtzu => ("fcvtzu", size),
34363444
VecMisc2::Scvtf => ("scvtf", size),
34373445
VecMisc2::Ucvtf => ("ucvtf", size),
3446+
VecMisc2::Frintn => ("frintn", size),
3447+
VecMisc2::Frintz => ("frintz", size),
3448+
VecMisc2::Frintm => ("frintm", size),
3449+
VecMisc2::Frintp => ("frintp", size),
34383450
};
34393451

34403452
let rd_size = if is_shll { size.widen() } else { size };

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 70 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2373,6 +2373,43 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
23732373
}
23742374
}
23752375

2376+
Opcode::FminPseudo | Opcode::FmaxPseudo => {
2377+
let ty = ctx.input_ty(insn, 0);
2378+
if ty == F32X4 || ty == F64X2 {
2379+
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
2380+
// pmax(a,b) => bitsel(b, a, cmpgt(b, a))
2381+
let r_dst = get_output_reg(ctx, outputs[0]);
2382+
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2383+
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
2384+
// Since we're going to write the output register `r_dst` anyway, we might as
2385+
// well first use it to hold the comparison result. This has the slightly unusual
2386+
// effect that we modify the output register in the first instruction (`fcmgt`)
2387+
// but read both the inputs again in the second instruction (`bsl`), which means
2388+
// that the output register can't be either of the input registers. Regalloc
2389+
// should handle this correctly, nevertheless.
2390+
ctx.emit(Inst::VecRRR {
2391+
alu_op: VecALUOp::Fcmgt,
2392+
rd: r_dst,
2393+
rn: if op == Opcode::FminPseudo { r_a } else { r_b },
2394+
rm: if op == Opcode::FminPseudo { r_b } else { r_a },
2395+
size: if ty == F32X4 {
2396+
VectorSize::Size32x4
2397+
} else {
2398+
VectorSize::Size64x2
2399+
},
2400+
});
2401+
ctx.emit(Inst::VecRRR {
2402+
alu_op: VecALUOp::Bsl,
2403+
rd: r_dst,
2404+
rn: r_b,
2405+
rm: r_a,
2406+
size: VectorSize::Size8x16,
2407+
});
2408+
} else {
2409+
panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
2410+
}
2411+
}
2412+
23762413
Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
23772414
let ty = ty.unwrap();
23782415
let bits = ty_bits(ty);
@@ -2411,21 +2448,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
24112448
}
24122449

24132450
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
2414-
let bits = ty_bits(ctx.output_ty(insn, 0));
2415-
let op = match (op, bits) {
2416-
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
2417-
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
2418-
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
2419-
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
2420-
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
2421-
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
2422-
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
2423-
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
2424-
_ => panic!("Unknown op/bits combination"),
2425-
};
2426-
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2427-
let rd = get_output_reg(ctx, outputs[0]);
2428-
ctx.emit(Inst::FpuRound { op, rd, rn });
2451+
let ty = ctx.output_ty(insn, 0);
2452+
if !ty.is_vector() {
2453+
let bits = ty_bits(ty);
2454+
let op = match (op, bits) {
2455+
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
2456+
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
2457+
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
2458+
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
2459+
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
2460+
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
2461+
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
2462+
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
2463+
_ => panic!("Unknown op/bits combination (scalar)"),
2464+
};
2465+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2466+
let rd = get_output_reg(ctx, outputs[0]);
2467+
ctx.emit(Inst::FpuRound { op, rd, rn });
2468+
} else {
2469+
let (op, size) = match (op, ty) {
2470+
(Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
2471+
(Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
2472+
(Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
2473+
(Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
2474+
(Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
2475+
(Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
2476+
(Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
2477+
(Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
2478+
_ => panic!("Unknown op/ty combination (vector){:?}", ty),
2479+
};
2480+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2481+
let rd = get_output_reg(ctx, outputs[0]);
2482+
ctx.emit(Inst::VecMisc { op, rd, rn, size });
2483+
}
24292484
}
24302485

24312486
Opcode::Fma => {
0 Bytes
Binary file not shown.

cranelift/wasm/src/code_translator.rs

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
16791679
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
16801680
state.push1(builder.ins().fmin(a, b))
16811681
}
1682+
Operator::F32x4PMax | Operator::F64x2PMax => {
1683+
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
1684+
state.push1(builder.ins().fmax_pseudo(a, b))
1685+
}
1686+
Operator::F32x4PMin | Operator::F64x2PMin => {
1687+
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
1688+
state.push1(builder.ins().fmin_pseudo(a, b))
1689+
}
16821690
Operator::F32x4Sqrt | Operator::F64x2Sqrt => {
16831691
let a = pop1_with_bitcast(state, type_of(op), builder);
16841692
state.push1(builder.ins().sqrt(a))
@@ -1756,19 +1764,24 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
17561764
state.push1(builder.ins().uwiden_high(a))
17571765
}
17581766

1759-
Operator::F32x4Ceil
1760-
| Operator::F32x4Floor
1761-
| Operator::F32x4Trunc
1762-
| Operator::F32x4Nearest
1763-
| Operator::F32x4PMin
1764-
| Operator::F32x4PMax
1765-
| Operator::F64x2Ceil
1766-
| Operator::F64x2Floor
1767-
| Operator::F64x2Trunc
1768-
| Operator::F64x2PMin
1769-
| Operator::F64x2PMax
1770-
| Operator::F64x2Nearest => {
1771-
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
1767+
Operator::F32x4Ceil | Operator::F64x2Ceil => {
1768+
// This is something of a misuse of `type_of`, because that produces the return type
1769+
// of `op`. In this case we want the arg type, but we know it's the same as the
1770+
// return type. Same for the 3 cases below.
1771+
let arg = pop1_with_bitcast(state, type_of(op), builder);
1772+
state.push1(builder.ins().ceil(arg));
1773+
}
1774+
Operator::F32x4Floor | Operator::F64x2Floor => {
1775+
let arg = pop1_with_bitcast(state, type_of(op), builder);
1776+
state.push1(builder.ins().floor(arg));
1777+
}
1778+
Operator::F32x4Trunc | Operator::F64x2Trunc => {
1779+
let arg = pop1_with_bitcast(state, type_of(op), builder);
1780+
state.push1(builder.ins().trunc(arg));
1781+
}
1782+
Operator::F32x4Nearest | Operator::F64x2Nearest => {
1783+
let arg = pop1_with_bitcast(state, type_of(op), builder);
1784+
state.push1(builder.ins().nearest(arg));
17721785
}
17731786

17741787
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
@@ -2528,8 +2541,14 @@ fn type_of(operator: &Operator) -> Type {
25282541
| Operator::F32x4Div
25292542
| Operator::F32x4Min
25302543
| Operator::F32x4Max
2544+
| Operator::F32x4PMin
2545+
| Operator::F32x4PMax
25312546
| Operator::I32x4TruncSatF32x4S
2532-
| Operator::I32x4TruncSatF32x4U => F32X4,
2547+
| Operator::I32x4TruncSatF32x4U
2548+
| Operator::F32x4Ceil
2549+
| Operator::F32x4Floor
2550+
| Operator::F32x4Trunc
2551+
| Operator::F32x4Nearest => F32X4,
25332552

25342553
Operator::F64x2Splat
25352554
| Operator::F64x2ExtractLane { .. }
@@ -2548,7 +2567,13 @@ fn type_of(operator: &Operator) -> Type {
25482567
| Operator::F64x2Mul
25492568
| Operator::F64x2Div
25502569
| Operator::F64x2Min
2551-
| Operator::F64x2Max => F64X2,
2570+
| Operator::F64x2Max
2571+
| Operator::F64x2PMin
2572+
| Operator::F64x2PMax
2573+
| Operator::F64x2Ceil
2574+
| Operator::F64x2Floor
2575+
| Operator::F64x2Trunc
2576+
| Operator::F64x2Nearest => F64X2,
25522577

25532578
_ => unimplemented!(
25542579
"Currently only SIMD instructions are mapped to their return type; the \

0 commit comments

Comments
 (0)