CL/aarch64: implement the wasm SIMD pseudo-max/min and FP-rounding instructions

julian-seward1 · julian-seward1 · commit e20f0cf6463c · 2020-10-26T09:56:39.000+01:00
This patch implements, for aarch64, the following wasm SIMD extensions:

* Floating-point rounding instructions (WebAssembly/simd#232)
* Pseudo-Minimum and Pseudo-Maximum instructions (WebAssembly/simd#122)

The changes are straightforward:

* `build.rs`: the relevant tests have been enabled.
* `cranelift/codegen/meta/src/shared/instructions.rs`: new CLIF instructions `fmin_pseudo` and `fmax_pseudo`. The wasm rounding instructions do not need any new CLIF instructions.
* `cranelift/wasm/src/code_translator.rs`: translation into CLIF; this is pretty much the same as any other unary or binary vector instruction (for the rounding and the pmin/max respectively).
* `cranelift/codegen/src/isa/aarch64/lower_inst.rs`:
  - `fmin_pseudo` and `fmax_pseudo` are converted into a two-instruction sequence, `fcmgt` followed by `bsl`.
  - the CLIF rounding instructions are converted to a suitable vector `frint{n,z,p,m}` instruction.
* `cranelift/codegen/src/isa/aarch64/inst/mod.rs`: minor extension of `pub enum VecMisc2` to handle the rounding operations, with the corresponding `emit` cases added in `cranelift/codegen/src/isa/aarch64/inst/emit.rs`.
diff --git a/build.rs b/build.rs
@@ -229,17 +229,17 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                 return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
             }
 
-            // This is only implemented on aarch64.
-            ("simd", "simd_boolean") => {
+            // These are only implemented on aarch64.
+            ("simd", "simd_boolean")
+            | ("simd", "simd_f32x4_pmin_pmax")
+            | ("simd", "simd_f32x4_rounding")
+            | ("simd", "simd_f64x2_pmin_pmax")
+            | ("simd", "simd_f64x2_rounding") => {
                 return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "aarch64";
             }
 
             // These tests have simd operators which aren't implemented yet.
-            ("simd", "simd_f32x4_pmin_pmax") => return true,
-            ("simd", "simd_f32x4_rounding") => return true,
-            ("simd", "simd_f64x2_pmin_pmax") => return true,
-            ("simd", "simd_f64x2_rounding") => return true,
-
+            // (currently none)
             _ => {}
         },
         _ => panic!("unrecognized strategy"),
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3577,6 +3577,22 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "fmin_pseudo",
+            r#"
+        Floating point pseudo-minimum, propagating NaNs.  This behaves differently from ``fmin``.
+        See https://github.com/WebAssembly/simd/pull/122 for background.
+
+        The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour
+        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let a = &Operand::new("a", Float).with_doc("The larger of ``x`` and ``y``");
 
     ig.push(
@@ -3593,6 +3609,22 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "fmax_pseudo",
+            r#"
+        Floating point pseudo-maximum, propagating NaNs.  This behaves differently from ``fmax``.
+        See https://github.com/WebAssembly/simd/pull/122 for background.
+
+        The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour
+        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let a = &Operand::new("a", Float).with_doc("``x`` rounded to integral value");
 
     ig.push(
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1429,6 +1429,22 @@ impl MachInstEmit for Inst {
                         debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
                         (0b1, 0b11101, enc_size & 0b1)
                     }
+                    VecMisc2::Frintn => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11000, enc_size & 0b01)
+                    }
+                    VecMisc2::Frintz => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11001, enc_size | 0b10)
+                    }
+                    VecMisc2::Frintm => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11001, enc_size & 0b01)
+                    }
+                    VecMisc2::Frintp => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11000, enc_size | 0b10)
+                    }
                 };
                 sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3476,6 +3476,94 @@ fn test_aarch64_binemit() {
         "ucvtf v10.2d, v19.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintn,
+            rd: writable_vreg(11),
+            rn: vreg(18),
+            size: VectorSize::Size32x4,
+        },
+        "4B8A214E",
+        "frintn v11.4s, v18.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintn,
+            rd: writable_vreg(12),
+            rn: vreg(17),
+            size: VectorSize::Size64x2,
+        },
+        "2C8A614E",
+        "frintn v12.2d, v17.2d",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintz,
+            rd: writable_vreg(11),
+            rn: vreg(18),
+            size: VectorSize::Size32x4,
+        },
+        "4B9AA14E",
+        "frintz v11.4s, v18.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintz,
+            rd: writable_vreg(12),
+            rn: vreg(17),
+            size: VectorSize::Size64x2,
+        },
+        "2C9AE14E",
+        "frintz v12.2d, v17.2d",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintm,
+            rd: writable_vreg(11),
+            rn: vreg(18),
+            size: VectorSize::Size32x4,
+        },
+        "4B9A214E",
+        "frintm v11.4s, v18.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintm,
+            rd: writable_vreg(12),
+            rn: vreg(17),
+            size: VectorSize::Size64x2,
+        },
+        "2C9A614E",
+        "frintm v12.2d, v17.2d",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintp,
+            rd: writable_vreg(11),
+            rn: vreg(18),
+            size: VectorSize::Size32x4,
+        },
+        "4B8AA14E",
+        "frintp v11.4s, v18.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Frintp,
+            rd: writable_vreg(12),
+            rn: vreg(17),
+            size: VectorSize::Size64x2,
+        },
+        "2C8AE14E",
+        "frintp v12.2d, v17.2d",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -318,6 +318,14 @@ pub enum VecMisc2 {
     Scvtf,
     /// Unsigned integer convert to floating-point
     Ucvtf,
+    /// Floating point round to integral, rounding towards nearest (ties to even)
+    Frintn,
+    /// Floating point round to integral, rounding towards zero
+    Frintz,
+    /// Floating point round to integral, rounding towards minus infinity
+    Frintm,
+    /// Floating point round to integral, rounding towards plus infinity
+    Frintp,
 }
 
 /// A Vector narrowing operation with two registers.
@@ -3435,6 +3443,10 @@ impl Inst {
                     VecMisc2::Fcvtzu => ("fcvtzu", size),
                     VecMisc2::Scvtf => ("scvtf", size),
                     VecMisc2::Ucvtf => ("ucvtf", size),
+                    VecMisc2::Frintn => ("frintn", size),
+                    VecMisc2::Frintz => ("frintz", size),
+                    VecMisc2::Frintm => ("frintm", size),
+                    VecMisc2::Frintp => ("frintp", size),
                 };
 
                 let rd_size = if is_shll { size.widen() } else { size };
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2373,6 +2373,43 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::FminPseudo | Opcode::FmaxPseudo => {
+            let ty = ctx.input_ty(insn, 0);
+            if ty == F32X4 || ty == F64X2 {
+                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
+                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
+                let r_dst = get_output_reg(ctx, outputs[0]);
+                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                // Since we're going to write the output register `r_dst` anyway, we might as
+                // well first use it to hold the comparison result.  This has the slightly unusual
+                // effect that we modify the output register in the first instruction (`fcmgt`)
+                // but read both the inputs again in the second instruction (`bsl`), which means
+                // that the output register can't be either of the input registers.  Regalloc
+                // should handle this correctly, nevertheless.
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Fcmgt,
+                    rd: r_dst,
+                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
+                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
+                    size: if ty == F32X4 {
+                        VectorSize::Size32x4
+                    } else {
+                        VectorSize::Size64x2
+                    },
+                });
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Bsl,
+                    rd: r_dst,
+                    rn: r_b,
+                    rm: r_a,
+                    size: VectorSize::Size8x16,
+                });
+            } else {
+                panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
+            }
+        }
+
         Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
             let ty = ty.unwrap();
             let bits = ty_bits(ty);
@@ -2411,21 +2448,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
-            let bits = ty_bits(ctx.output_ty(insn, 0));
-            let op = match (op, bits) {
-                (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
-                (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
-                (Opcode::Floor, 32) => FpuRoundMode::Minus32,
-                (Opcode::Floor, 64) => FpuRoundMode::Minus64,
-                (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
-                (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
-                (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
-                (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
-                _ => panic!("Unknown op/bits combination"),
-            };
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::FpuRound { op, rd, rn });
+            let ty = ctx.output_ty(insn, 0);
+            if !ty.is_vector() {
+                let bits = ty_bits(ty);
+                let op = match (op, bits) {
+                    (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
+                    (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
+                    (Opcode::Floor, 32) => FpuRoundMode::Minus32,
+                    (Opcode::Floor, 64) => FpuRoundMode::Minus64,
+                    (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
+                    (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
+                    (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
+                    (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
+                    _ => panic!("Unknown op/bits combination (scalar)"),
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::FpuRound { op, rd, rn });
+            } else {
+                let (op, size) = match (op, ty) {
+                    (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
+                    (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
+                    (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
+                    (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
+                    (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
+                    (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
+                    (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
+                    (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
+                    _ => panic!("Unknown op/ty combination (vector){:?}", ty),
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::VecMisc { op, rd, rn, size });
+            }
         }
 
         Opcode::Fma => {
diff --git a/cranelift/codegen/src/preopt.serialized b/cranelift/codegen/src/preopt.serialized
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
@@ -1679,6 +1679,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().fmin(a, b))
         }
+        Operator::F32x4PMax | Operator::F64x2PMax => {
+            let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().fmax_pseudo(a, b))
+        }
+        Operator::F32x4PMin | Operator::F64x2PMin => {
+            let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().fmin_pseudo(a, b))
+        }
         Operator::F32x4Sqrt | Operator::F64x2Sqrt => {
             let a = pop1_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().sqrt(a))
@@ -1756,19 +1764,24 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             state.push1(builder.ins().uwiden_high(a))
         }
 
-        Operator::F32x4Ceil
-        | Operator::F32x4Floor
-        | Operator::F32x4Trunc
-        | Operator::F32x4Nearest
-        | Operator::F32x4PMin
-        | Operator::F32x4PMax
-        | Operator::F64x2Ceil
-        | Operator::F64x2Floor
-        | Operator::F64x2Trunc
-        | Operator::F64x2PMin
-        | Operator::F64x2PMax
-        | Operator::F64x2Nearest => {
-            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
+        Operator::F32x4Ceil | Operator::F64x2Ceil => {
+            // This is something of a misuse of `type_of`, because that produces the return type
+            // of `op`.  In this case we want the arg type, but we know it's the same as the
+            // return type.  Same for the 3 cases below.
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().ceil(arg));
+        }
+        Operator::F32x4Floor | Operator::F64x2Floor => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().floor(arg));
+        }
+        Operator::F32x4Trunc | Operator::F64x2Trunc => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().trunc(arg));
+        }
+        Operator::F32x4Nearest | Operator::F64x2Nearest => {
+            let arg = pop1_with_bitcast(state, type_of(op), builder);
+            state.push1(builder.ins().nearest(arg));
         }
 
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
@@ -2528,8 +2541,14 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::F32x4Div
         | Operator::F32x4Min
         | Operator::F32x4Max
+        | Operator::F32x4PMin
+        | Operator::F32x4PMax
         | Operator::I32x4TruncSatF32x4S
-        | Operator::I32x4TruncSatF32x4U => F32X4,
+        | Operator::I32x4TruncSatF32x4U
+        | Operator::F32x4Ceil
+        | Operator::F32x4Floor
+        | Operator::F32x4Trunc
+        | Operator::F32x4Nearest => F32X4,
 
         Operator::F64x2Splat
         | Operator::F64x2ExtractLane { .. }
@@ -2548,7 +2567,13 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::F64x2Mul
         | Operator::F64x2Div
         | Operator::F64x2Min
-        | Operator::F64x2Max => F64X2,
+        | Operator::F64x2Max
+        | Operator::F64x2PMin
+        | Operator::F64x2PMax
+        | Operator::F64x2Ceil
+        | Operator::F64x2Floor
+        | Operator::F64x2Trunc
+        | Operator::F64x2Nearest => F64X2,
 
         _ => unimplemented!(
             "Currently only SIMD instructions are mapped to their return type; the \