Add extend-add-pairwise instructions for x64

jlb6740 · jlb6740 · commit f4d1c26ce83a · 2021-06-29T22:01:37.000-07:00
diff --git a/build.rs b/build.rs
@@ -190,10 +190,8 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
 
     match (testsuite, testname) {
         ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6
-        ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
         ("simd", "simd_i16x8_extmul_i8x16") => return true,
         ("simd", "simd_i16x8_q15mulr_sat_s") => return true,
-        ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
         ("simd", "simd_i32x4_extmul_i16x8") => return true,
         ("simd", "simd_i32x4_trunc_sat_f64x2") => return true,
         ("simd", "simd_i64x2_extmul_i32x4") => return true,
@@ -229,9 +227,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
 
             // These are new instructions that are not really implemented in any backend.
             ("simd", "simd_conversions")
-            | ("simd", "simd_i16x8_extadd_pairwise_i8x16")
             | ("simd", "simd_i16x8_extmul_i8x16")
-            | ("simd", "simd_i32x4_extadd_pairwise_i16x8")
+            | ("simd", "simd_i16x8_q15mulr_sat_s")
             | ("simd", "simd_i32x4_extmul_i16x8")
             | ("simd", "simd_i32x4_trunc_sat_f64x2")
             | ("simd", "simd_i64x2_extmul_i32x4") => return true,
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4095,7 +4095,34 @@ pub(crate) fn define(
         Inst::new(
             "uwiden_high",
             r#"
-        Widen the high lanes of `x` using unsigned extension.
+        Widen the high lanes of `x` using unsigned extension, producing lanes
+        twice as wide as the input lanes (and half as many of them).
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "extended_pairwise_add_signed",
+            r#"
+        Add adjacent pairs of lanes of `x` after sign-extending each lane.
+
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "extended_pairwise_add_unsigned",
+            r#"
+        Add adjacent pairs of lanes of `x` after zero-extending each lane.
 
         This will double the lane width and halve the number of lanes.
             "#,
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -3545,6 +3545,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Fvdemote
         | Opcode::FvpromoteLow
         | Opcode::Vconcat
+        | Opcode::ExtendedPairwiseAddSigned
+        | Opcode::ExtendedPairwiseAddUnsigned
         | Opcode::Vsplit => unimplemented!("lowering {}", op),
     }
 
diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs
@@ -2867,7 +2867,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::WideningPairwiseDotProductS
         | Opcode::SqmulRoundSat
         | Opcode::FvpromoteLow
-        | Opcode::Fvdemote => {
+        | Opcode::Fvdemote
+        | Opcode::ExtendedPairwiseAddSigned
+        | Opcode::ExtendedPairwiseAddUnsigned => {
             // TODO
             unimplemented!("Vector ops not implemented.");
         }
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -567,6 +567,7 @@ pub enum SseOpcode {
     Pinsrb,
     Pinsrw,
     Pinsrd,
+    Pmaddubsw,
     Pmaddwd,
     Pmaxsb,
     Pmaxsw,
@@ -734,6 +735,7 @@ impl SseOpcode {
             | SseOpcode::Pcmpgtd
             | SseOpcode::Pextrw
             | SseOpcode::Pinsrw
+            | SseOpcode::Pmaddubsw
             | SseOpcode::Pmaddwd
             | SseOpcode::Pmaxsw
             | SseOpcode::Pmaxub
@@ -925,6 +927,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Pinsrb => "pinsrb",
             SseOpcode::Pinsrw => "pinsrw",
             SseOpcode::Pinsrd => "pinsrd",
+            SseOpcode::Pmaddubsw => "pmaddubsw",
             SseOpcode::Pmaddwd => "pmaddwd",
             SseOpcode::Pmaxsb => "pmaxsb",
             SseOpcode::Pmaxsw => "pmaxsw",
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1483,6 +1483,7 @@ pub(crate) fn emit(
                 SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
                 SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
                 SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
+                SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
                 SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
                 SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
                 SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4495,6 +4495,128 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
             }
         }
+        Opcode::ExtendedPairwiseAddSigned | Opcode::ExtendedPairwiseAddUnsigned => {
+            // Extended pairwise addition instructions compute extended sums within adjacent
+            // pairs of lanes of a SIMD vector, producing a SIMD vector with half as many lanes.
+            // Instruction sequences taken from the instruction spec PR https://github.com/WebAssembly/simd/pull/380
+            /*
+            let input_ty = ctx.input_ty(insn, 0);
+            let output_ty = ctx.output_ty(insn, 0);
+            let src = put_input_in_reg(ctx, inputs[0]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            unreachable!();
+            match op {
+                Opcode::ExtendedPairwiseAddSigned => match (input_ty, output_ty) {
+                    (types::I8X16, types::I16X8) => {
+                        static MUL_CONST: [u8; 16] = [0x01; 16];
+                        let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
+                        let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
+                        ctx.emit(Inst::xmm_mov(
+                            SseOpcode::Movdqa,
+                            RegMem::reg(mul_const_reg.to_reg()),
+                            dst,
+                        ));
+                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddubsw, RegMem::reg(src), dst));
+                    }
+                    (types::I16X8, types::I32X4) => {
+                        static MUL_CONST: [u8; 16] = [
+                            0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+                            0x01, 0x00, 0x01, 0x00,
+                        ];
+                        let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
+                        let mul_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I16X8));
+                        ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pmaddwd,
+                            RegMem::reg(mul_const_reg.to_reg()),
+                            dst,
+                        ));
+                    }
+                    _ => unreachable!(
+                        "Type pattern not supported {:?}-{:?} not supported for {:?}.",
+                        input_ty, output_ty, op
+                    ),
+                },
+                Opcode::ExtendedPairwiseAddUnsigned => match (input_ty, output_ty) {
+                    (types::I8X16, types::I16X8) => {
+                        static MUL_CONST: [u8; 16] = [0x01; 16];
+                        let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
+                        let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
+                        ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pmaddubsw,
+                            RegMem::reg(mul_const_reg.to_reg()),
+                            dst,
+                        ));
+                    }
+                    (types::I16X8, types::I32X4) => {
+                        static PXOR_CONST: [u8; 16] = [
+                            0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+                            0x00, 0x80, 0x00, 0x80,
+                        ];
+                        let pxor_const =
+                            ctx.use_constant(VCodeConstantData::WellKnown(&PXOR_CONST));
+                        let pxor_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(
+                            pxor_const,
+                            pxor_const_reg,
+                            types::I16X8,
+                        ));
+                        ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pxor,
+                            RegMem::reg(pxor_const_reg.to_reg()),
+                            dst,
+                        ));
+
+                        static MADD_CONST: [u8; 16] = [
+                            0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+                            0x01, 0x00, 0x01, 0x00,
+                        ];
+                        let madd_const =
+                            ctx.use_constant(VCodeConstantData::WellKnown(&MADD_CONST));
+                        let madd_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(
+                            madd_const,
+                            madd_const_reg,
+                            types::I16X8,
+                        ));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pmaddwd,
+                            RegMem::reg(madd_const_reg.to_reg()),
+                            dst,
+                        ));
+
+                        static ADDD_CONST2: [u8; 16] = [
+                            0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+                            0x00, 0x00, 0x01, 0x00,
+                        ];
+                        let addd_const2 =
+                            ctx.use_constant(VCodeConstantData::WellKnown(&ADDD_CONST2));
+                        let addd_const2_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_load_const(
+                            addd_const2,
+                            addd_const2_reg,
+                            types::I16X8,
+                        ));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Paddd,
+                            RegMem::reg(addd_const2_reg.to_reg()),
+                            dst,
+                        ));
+                    }
+                    _ => unreachable!(
+                        "Type pattern not supported {:?}-{:?} not supported for {:?}.",
+                        input_ty, output_ty, op
+                    ),
+                },
+                _ => unreachable!("{:?} not supported.", op),
+            }
+            */
+        }
         Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
             let input_ty = ctx.input_ty(insn, 0);
             let output_ty = ctx.output_ty(insn, 0);
diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
@@ -575,6 +575,8 @@ where
         Opcode::Fence => unimplemented!("Fence"),
         Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"),
         Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"),
+        Opcode::ExtendedPairwiseAddSigned => unimplemented!("ExtendedPairwiseAddSigned"),
+        Opcode::ExtendedPairwiseAddUnsigned => unimplemented!("ExtendedPairwiseAddUnsigned"),
 
         // TODO: these instructions should be removed once the new backend makes these obsolete
         // (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
@@ -1858,6 +1858,22 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = pop1_with_bitcast(state, I32X4, builder);
             state.push1(builder.ins().uwiden_high(a))
         }
+        Operator::I16x8ExtAddPairwiseI8x16S => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().extended_pairwise_add_signed(a))
+        }
+        Operator::I32x4ExtAddPairwiseI16x8S => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().extended_pairwise_add_signed(a))
+        }
+        Operator::I16x8ExtAddPairwiseI8x16U => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().extended_pairwise_add_unsigned(a))
+        }
+        Operator::I32x4ExtAddPairwiseI16x8U => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().extended_pairwise_add_unsigned(a))
+        }
         Operator::F32x4Ceil | Operator::F64x2Ceil => {
             // This is something of a misuse of `type_of`, because that produces the return type
             // of `op`.  In this case we want the arg type, but we know it's the same as the
@@ -1902,10 +1918,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I64x2ExtMulHighI32x4S
         | Operator::I64x2ExtMulLowI32x4U
         | Operator::I64x2ExtMulHighI32x4U
-        | Operator::I16x8ExtAddPairwiseI8x16S
-        | Operator::I16x8ExtAddPairwiseI8x16U
-        | Operator::I32x4ExtAddPairwiseI16x8S
-        | Operator::I32x4ExtAddPairwiseI16x8U
         | Operator::F64x2ConvertLowI32x4U
         | Operator::I32x4TruncSatF64x2SZero
         | Operator::I32x4TruncSatF64x2UZero => {

Original file line number	Diff line number	Diff line change
`@@ -3545,6 +3545,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(`
`3545`	`3545`	`\| Opcode::Fvdemote`
`3546`	`3546`	`\| Opcode::FvpromoteLow`
`3547`	`3547`	`\| Opcode::Vconcat`
	`3548`	`+ \| Opcode::ExtendedPairwiseAddSigned`
	`3549`	`+ \| Opcode::ExtendedPairwiseAddUnsigned`
`3548`	`3550`	`\| Opcode::Vsplit => unimplemented!("lowering {}", op),`
`3549`	`3551`	`}`
`3550`	`3552`