From 8aa9cc53455326b74e64971a8118fe5803301457 Mon Sep 17 00:00:00 2001 From: adhoc Date: Mon, 27 Jan 2025 10:36:06 +0100 Subject: [PATCH 01/10] packed integer add --- crates/wast-util/src/lib.rs | 2 +- tests/disas/winch/x64/i16x8/add/add.wat | 45 +++++++++++++++++++ .../extract_lane_s}/const_avx.wat | 0 .../extract_lane_u}/const.wat | 0 .../replace_lane}/const_avx.wat | 0 .../replace_lane}/param_avx.wat | 0 .../splat}/const_avx2.wat | 0 .../splat}/param_avx2.wat | 0 tests/disas/winch/x64/i32x4/add/add.wat | 45 +++++++++++++++++++ .../extract_lane}/const_avx.wat | 0 .../replace_lane}/const_avx.wat | 0 .../replace_lane}/param_avx.wat | 0 .../splat}/const_avx2.wat | 0 .../splat}/param_avx2.wat | 0 tests/disas/winch/x64/i64x2/add/add.wat | 37 +++++++++++++++ .../extract_lane}/const.wat | 0 .../replace_lane}/const_avx.wat | 0 .../replace_lane}/param_avx.wat | 0 .../splat}/const_avx.wat | 0 .../splat}/param_avx.wat | 0 tests/disas/winch/x64/i8x16/add/add.wat | 45 +++++++++++++++++++ .../extract_lane_s}/const_avx.wat | 0 .../extract_lane_u}/const_avx.wat | 0 .../replace_lane}/const_avx.wat | 0 .../replace_lane}/param_avx.wat | 0 .../shuffle}/const_avx.wat | 0 .../splat}/const_avx2.wat | 0 .../splat}/param_avx2.wat | 0 .../swizzle}/const_avx.wat | 0 winch/codegen/src/isa/aarch64/masm.rs | 12 +++++ winch/codegen/src/isa/x64/masm.rs | 15 +++++++ winch/codegen/src/masm.rs | 4 ++ winch/codegen/src/visitor.rs | 37 +++++++++++++++ 33 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 tests/disas/winch/x64/i16x8/add/add.wat rename tests/disas/winch/x64/{i16x8_extract_lane_s => i16x8/extract_lane_s}/const_avx.wat (100%) rename tests/disas/winch/x64/{i16x8_extract_lane_u => i16x8/extract_lane_u}/const.wat (100%) rename tests/disas/winch/x64/{i16x8_replace_lane => i16x8/replace_lane}/const_avx.wat (100%) rename tests/disas/winch/x64/{i16x8_replace_lane => i16x8/replace_lane}/param_avx.wat (100%) rename tests/disas/winch/x64/{i16x8_splat => i16x8/splat}/const_avx2.wat 
(100%) rename tests/disas/winch/x64/{i16x8_splat => i16x8/splat}/param_avx2.wat (100%) create mode 100644 tests/disas/winch/x64/i32x4/add/add.wat rename tests/disas/winch/x64/{i32x4_extract_lane => i32x4/extract_lane}/const_avx.wat (100%) rename tests/disas/winch/x64/{i32x4_replace_lane => i32x4/replace_lane}/const_avx.wat (100%) rename tests/disas/winch/x64/{i32x4_replace_lane => i32x4/replace_lane}/param_avx.wat (100%) rename tests/disas/winch/x64/{i32x4_splat => i32x4/splat}/const_avx2.wat (100%) rename tests/disas/winch/x64/{i32x4_splat => i32x4/splat}/param_avx2.wat (100%) create mode 100644 tests/disas/winch/x64/i64x2/add/add.wat rename tests/disas/winch/x64/{i64x2_extract_lane => i64x2/extract_lane}/const.wat (100%) rename tests/disas/winch/x64/{i64x2_replace_lane => i64x2/replace_lane}/const_avx.wat (100%) rename tests/disas/winch/x64/{i64x2_replace_lane => i64x2/replace_lane}/param_avx.wat (100%) rename tests/disas/winch/x64/{i64x2_splat => i64x2/splat}/const_avx.wat (100%) rename tests/disas/winch/x64/{i64x2_splat => i64x2/splat}/param_avx.wat (100%) create mode 100644 tests/disas/winch/x64/i8x16/add/add.wat rename tests/disas/winch/x64/{i8x16_extract_lane_s => i8x16/extract_lane_s}/const_avx.wat (100%) rename tests/disas/winch/x64/{i8x16_extract_lane_u => i8x16/extract_lane_u}/const_avx.wat (100%) rename tests/disas/winch/x64/{i8x16_replace_lane => i8x16/replace_lane}/const_avx.wat (100%) rename tests/disas/winch/x64/{i8x16_replace_lane => i8x16/replace_lane}/param_avx.wat (100%) rename tests/disas/winch/x64/{i8x16_shuffle => i8x16/shuffle}/const_avx.wat (100%) rename tests/disas/winch/x64/{i8x16_splat => i8x16/splat}/const_avx2.wat (100%) rename tests/disas/winch/x64/{i8x16_splat => i8x16/splat}/param_avx2.wat (100%) rename tests/disas/winch/x64/{i8x16_swizzle => i8x16/swizzle}/const_avx.wat (100%) diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index b2eb08953ad3..61561b5c6b5f 100644 --- a/crates/wast-util/src/lib.rs +++ 
b/crates/wast-util/src/lib.rs @@ -428,7 +428,6 @@ impl WastTest { "misc_testsuite/simd/issue_3327_bnot_lowering.wast", "spec_testsuite/simd_bit_shift.wast", "spec_testsuite/simd_boolean.wast", - "spec_testsuite/simd_const.wast", "spec_testsuite/simd_conversions.wast", "spec_testsuite/simd_f32x4.wast", "spec_testsuite/simd_f32x4_arith.wast", @@ -499,6 +498,7 @@ impl WastTest { "misc_testsuite/simd/unaligned-load.wast", "multi-memory/simd_memory-multi.wast", "misc_testsuite/simd/issue4807.wast", + "spec_testsuite/simd_const.wast", ]; if unsupported.iter().any(|part| self.path.ends_with(part)) { diff --git a/tests/disas/winch/x64/i16x8/add/add.wat b/tests/disas/winch/x64/i16x8/add/add.wat new file mode 100644 index 000000000000..e2ac03e2fd69 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/add/add.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.add + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i16x8_extract_lane_s/const_avx.wat b/tests/disas/winch/x64/i16x8/extract_lane_s/const_avx.wat similarity index 100% rename from 
tests/disas/winch/x64/i16x8_extract_lane_s/const_avx.wat rename to tests/disas/winch/x64/i16x8/extract_lane_s/const_avx.wat diff --git a/tests/disas/winch/x64/i16x8_extract_lane_u/const.wat b/tests/disas/winch/x64/i16x8/extract_lane_u/const.wat similarity index 100% rename from tests/disas/winch/x64/i16x8_extract_lane_u/const.wat rename to tests/disas/winch/x64/i16x8/extract_lane_u/const.wat diff --git a/tests/disas/winch/x64/i16x8_replace_lane/const_avx.wat b/tests/disas/winch/x64/i16x8/replace_lane/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i16x8_replace_lane/const_avx.wat rename to tests/disas/winch/x64/i16x8/replace_lane/const_avx.wat diff --git a/tests/disas/winch/x64/i16x8_replace_lane/param_avx.wat b/tests/disas/winch/x64/i16x8/replace_lane/param_avx.wat similarity index 100% rename from tests/disas/winch/x64/i16x8_replace_lane/param_avx.wat rename to tests/disas/winch/x64/i16x8/replace_lane/param_avx.wat diff --git a/tests/disas/winch/x64/i16x8_splat/const_avx2.wat b/tests/disas/winch/x64/i16x8/splat/const_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i16x8_splat/const_avx2.wat rename to tests/disas/winch/x64/i16x8/splat/const_avx2.wat diff --git a/tests/disas/winch/x64/i16x8_splat/param_avx2.wat b/tests/disas/winch/x64/i16x8/splat/param_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i16x8_splat/param_avx2.wat rename to tests/disas/winch/x64/i16x8/splat/param_avx2.wat diff --git a/tests/disas/winch/x64/i32x4/add/add.wat b/tests/disas/winch/x64/i32x4/add/add.wat new file mode 100644 index 000000000000..4143e0eae2d4 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/add/add.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i32x4.add + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddd %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4_extract_lane/const_avx.wat b/tests/disas/winch/x64/i32x4/extract_lane/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i32x4_extract_lane/const_avx.wat rename to tests/disas/winch/x64/i32x4/extract_lane/const_avx.wat diff --git a/tests/disas/winch/x64/i32x4_replace_lane/const_avx.wat b/tests/disas/winch/x64/i32x4/replace_lane/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i32x4_replace_lane/const_avx.wat rename to tests/disas/winch/x64/i32x4/replace_lane/const_avx.wat diff --git a/tests/disas/winch/x64/i32x4_replace_lane/param_avx.wat b/tests/disas/winch/x64/i32x4/replace_lane/param_avx.wat similarity index 100% rename from tests/disas/winch/x64/i32x4_replace_lane/param_avx.wat rename to tests/disas/winch/x64/i32x4/replace_lane/param_avx.wat diff --git a/tests/disas/winch/x64/i32x4_splat/const_avx2.wat b/tests/disas/winch/x64/i32x4/splat/const_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i32x4_splat/const_avx2.wat rename to 
tests/disas/winch/x64/i32x4/splat/const_avx2.wat diff --git a/tests/disas/winch/x64/i32x4_splat/param_avx2.wat b/tests/disas/winch/x64/i32x4/splat/param_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i32x4_splat/param_avx2.wat rename to tests/disas/winch/x64/i32x4/splat/param_avx2.wat diff --git a/tests/disas/winch/x64/i64x2/add/add.wat b/tests/disas/winch/x64/i64x2/add/add.wat new file mode 100644 index 000000000000..2014f6992ae5 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/add/add.wat @@ -0,0 +1,37 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i64x2.add + (i64x2.splat (i64.const 10)) + (i64x2.splat (i64.const 10)) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x48 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; vpshufd $0x44, 0x1b(%rip), %xmm0 +;; vpshufd $0x44, 0x12(%rip), %xmm1 +;; vpaddq %xmm1, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 48: ud2 +;; 4a: addb %al, (%rax) +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: orb (%rax), %al +;; 52: addb %al, (%rax) +;; 54: addb %al, (%rax) +;; 56: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2_extract_lane/const.wat b/tests/disas/winch/x64/i64x2/extract_lane/const.wat similarity index 100% rename from tests/disas/winch/x64/i64x2_extract_lane/const.wat rename to tests/disas/winch/x64/i64x2/extract_lane/const.wat diff --git a/tests/disas/winch/x64/i64x2_replace_lane/const_avx.wat b/tests/disas/winch/x64/i64x2/replace_lane/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i64x2_replace_lane/const_avx.wat rename to tests/disas/winch/x64/i64x2/replace_lane/const_avx.wat diff --git a/tests/disas/winch/x64/i64x2_replace_lane/param_avx.wat b/tests/disas/winch/x64/i64x2/replace_lane/param_avx.wat 
similarity index 100% rename from tests/disas/winch/x64/i64x2_replace_lane/param_avx.wat rename to tests/disas/winch/x64/i64x2/replace_lane/param_avx.wat diff --git a/tests/disas/winch/x64/i64x2_splat/const_avx.wat b/tests/disas/winch/x64/i64x2/splat/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i64x2_splat/const_avx.wat rename to tests/disas/winch/x64/i64x2/splat/const_avx.wat diff --git a/tests/disas/winch/x64/i64x2_splat/param_avx.wat b/tests/disas/winch/x64/i64x2/splat/param_avx.wat similarity index 100% rename from tests/disas/winch/x64/i64x2_splat/param_avx.wat rename to tests/disas/winch/x64/i64x2/splat/param_avx.wat diff --git a/tests/disas/winch/x64/i8x16/add/add.wat b/tests/disas/winch/x64/i8x16/add/add.wat new file mode 100644 index 000000000000..dff418defad0 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/add/add.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.add + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16_extract_lane_s/const_avx.wat b/tests/disas/winch/x64/i8x16/extract_lane_s/const_avx.wat 
similarity index 100% rename from tests/disas/winch/x64/i8x16_extract_lane_s/const_avx.wat rename to tests/disas/winch/x64/i8x16/extract_lane_s/const_avx.wat diff --git a/tests/disas/winch/x64/i8x16_extract_lane_u/const_avx.wat b/tests/disas/winch/x64/i8x16/extract_lane_u/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_extract_lane_u/const_avx.wat rename to tests/disas/winch/x64/i8x16/extract_lane_u/const_avx.wat diff --git a/tests/disas/winch/x64/i8x16_replace_lane/const_avx.wat b/tests/disas/winch/x64/i8x16/replace_lane/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_replace_lane/const_avx.wat rename to tests/disas/winch/x64/i8x16/replace_lane/const_avx.wat diff --git a/tests/disas/winch/x64/i8x16_replace_lane/param_avx.wat b/tests/disas/winch/x64/i8x16/replace_lane/param_avx.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_replace_lane/param_avx.wat rename to tests/disas/winch/x64/i8x16/replace_lane/param_avx.wat diff --git a/tests/disas/winch/x64/i8x16_shuffle/const_avx.wat b/tests/disas/winch/x64/i8x16/shuffle/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_shuffle/const_avx.wat rename to tests/disas/winch/x64/i8x16/shuffle/const_avx.wat diff --git a/tests/disas/winch/x64/i8x16_splat/const_avx2.wat b/tests/disas/winch/x64/i8x16/splat/const_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_splat/const_avx2.wat rename to tests/disas/winch/x64/i8x16/splat/const_avx2.wat diff --git a/tests/disas/winch/x64/i8x16_splat/param_avx2.wat b/tests/disas/winch/x64/i8x16/splat/param_avx2.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_splat/param_avx2.wat rename to tests/disas/winch/x64/i8x16/splat/param_avx2.wat diff --git a/tests/disas/winch/x64/i8x16_swizzle/const_avx.wat b/tests/disas/winch/x64/i8x16/swizzle/const_avx.wat similarity index 100% rename from tests/disas/winch/x64/i8x16_swizzle/const_avx.wat rename to 
tests/disas/winch/x64/i8x16/swizzle/const_avx.wat diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index 7957cf47ff40..231e228c4565 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1101,6 +1101,18 @@ impl Masm for MacroAssembler { fn v128_any_true(&mut self, _src: Reg, _dst: WritableReg) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } + + fn v128_add( + &mut self, + _lhs: Reg, + _rhs: Reg, + _dst: WritableReg, + _size: OperandSize, + ) -> Result<()> { + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } } impl MacroAssembler { diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 78f1811899db..261da563fa13 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -1882,6 +1882,21 @@ impl Masm for MacroAssembler { self.asm.setcc(IntCmpKind::Ne, dst); Ok(()) } + + fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { + self.ensure_has_avx()?; + + let op = match size { + OperandSize::S8 => AvxOpcode::Vpaddb, + OperandSize::S16 => AvxOpcode::Vpaddw, + OperandSize::S32 => AvxOpcode::Vpaddd, + OperandSize::S64 => AvxOpcode::Vpaddq, + OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + }; + + self.asm.xmm_rmi_rvex(op, src, dst.to_reg(), dst); + Ok(()) + } } impl MacroAssembler { diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index c9ed5cebc0b3..77d9447f0518 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1645,4 +1645,8 @@ pub(crate) trait MacroAssembler { /// If any bit in `src` is 1, set `dst` to 1, or 0 otherwise. 
fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()>; + + /// Perform a vector add between `src` and `dst`, placing the result in `dst`, where each lane + /// is interpreted to be `size` long. + fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index e966bba27ab3..73bfede80d68 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -418,6 +418,10 @@ macro_rules! def_unsupported { (emit V128Store16Lane $($rest:tt)*) => {}; (emit V128Store32Lane $($rest:tt)*) => {}; (emit V128Store64Lane $($rest:tt)*) => {}; + (emit I8x16Add $($rest:tt)*) => {}; + (emit I16x8Add $($rest:tt)*) => {}; + (emit I32x4Add $($rest:tt)*) => {}; + (emit I64x2Add $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3520,6 +3524,39 @@ where self.emit_wasm_store(&arg, StoreKind::vector_lane(lane, OperandSize::S64)) } + fn visit_i8x16_add(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_add(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i16x8_add(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_add(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i32x4_add(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S32, |masm, dst, src, size| { + masm.v128_add(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i64x2_add(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { + masm.v128_add(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); } From 
7170e63ca339c0c714be67cb3230e3b350f31b8b Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 11:36:55 +0100 Subject: [PATCH 02/10] packed integer sub --- tests/disas/winch/x64/i16x8/sub/sub.wat | 45 +++++++++++++++++++++++++ tests/disas/winch/x64/i32x4/sub/sub.wat | 45 +++++++++++++++++++++++++ tests/disas/winch/x64/i64x2/sub/sub.wat | 37 ++++++++++++++++++++ tests/disas/winch/x64/i8x16/sub/sub.wat | 45 +++++++++++++++++++++++++ winch/codegen/src/isa/aarch64/masm.rs | 8 +++++ winch/codegen/src/isa/x64/masm.rs | 17 +++++++++- winch/codegen/src/masm.rs | 4 +++ winch/codegen/src/visitor.rs | 35 +++++++++++++++++++ 8 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 tests/disas/winch/x64/i16x8/sub/sub.wat create mode 100644 tests/disas/winch/x64/i32x4/sub/sub.wat create mode 100644 tests/disas/winch/x64/i64x2/sub/sub.wat create mode 100644 tests/disas/winch/x64/i8x16/sub/sub.wat diff --git a/tests/disas/winch/x64/i16x8/sub/sub.wat b/tests/disas/winch/x64/i16x8/sub/sub.wat new file mode 100644 index 000000000000..b028d315fba6 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/sub/sub.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.sub + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4/sub/sub.wat b/tests/disas/winch/x64/i32x4/sub/sub.wat new file mode 100644 index 000000000000..72e77c7d94d3 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/sub/sub.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i32x4.sub + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubd %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2/sub/sub.wat b/tests/disas/winch/x64/i64x2/sub/sub.wat new file mode 100644 index 000000000000..d1d47143ce98 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/sub/sub.wat @@ -0,0 +1,37 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i64x2.sub + (i64x2.splat (i64.const 10)) + (i64x2.splat (i64.const 10)) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x48 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; vpshufd $0x44, 0x1b(%rip), %xmm0 +;; vpshufd $0x44, 0x12(%rip), %xmm1 +;; vpsubq %xmm1, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 48: ud2 +;; 4a: addb %al, (%rax) +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: orb (%rax), %al +;; 52: addb %al, (%rax) +;; 54: addb %al, (%rax) +;; 56: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/sub/sub.wat b/tests/disas/winch/x64/i8x16/sub/sub.wat new file mode 100644 index 000000000000..11633deb12f9 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/sub/sub.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.sub + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index 231e228c4565..b775d06821b3 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1111,6 +1111,14 @@ impl Masm for MacroAssembler { ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } + + fn v128_sub( + &mut self, + _lhs: Reg, + _rhs: Reg, + _dst: WritableReg, + _size: OperandSize, + ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } } diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 261da563fa13..cb7f056954ae 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -1894,7 +1894,22 @@ impl Masm for MacroAssembler { OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), }; - self.asm.xmm_rmi_rvex(op, src, dst.to_reg(), dst); + self.asm.xmm_rmi_rvex(op, lhs, rhs, dst); + Ok(()) + } + + fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> 
Result<()> { + self.ensure_has_avx()?; + + let op = match size { + OperandSize::S8 => AvxOpcode::Vpsubb, + OperandSize::S16 => AvxOpcode::Vpsubw, + OperandSize::S32 => AvxOpcode::Vpsubd, + OperandSize::S64 => AvxOpcode::Vpsubq, + OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + }; + + self.asm.xmm_rmi_rvex(op, lhs, rhs, dst); Ok(()) } } diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index 77d9447f0518..9a43bf145b3c 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1649,4 +1649,8 @@ pub(crate) trait MacroAssembler { /// Perform a vector add between `src` and `dst`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; + + /// Perform a vector sub between `src` and `dst`, placing the result in `dst`, where each lane + /// is interpreted to be `size` long. + fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 73bfede80d68..2f8eed0f75ec 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -422,6 +422,10 @@ macro_rules! 
def_unsupported { (emit I16x8Add $($rest:tt)*) => {}; (emit I32x4Add $($rest:tt)*) => {}; (emit I64x2Add $($rest:tt)*) => {}; + (emit I8x16Sub $($rest:tt)*) => {}; + (emit I16x8Sub $($rest:tt)*) => {}; + (emit I32x4Sub $($rest:tt)*) => {}; + (emit I64x2Sub $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3555,6 +3559,37 @@ where Ok(TypedReg::new(WasmValType::V128, dst)) }) } + + fn visit_i8x16_sub(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_sub(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i16x8_sub(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_sub(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i32x4_sub(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S32, |masm, dst, src, size| { + masm.v128_sub(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i64x2_sub(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { + masm.v128_sub(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) } wasmparser::for_each_visit_simd_operator!(def_unsupported); From e5b677c045c127e39cacd4b0d4ab51818252a116 Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 12:33:59 +0100 Subject: [PATCH 03/10] packed integer mul --- tests/disas/winch/x64/i16x8/mul/mul.wat | 45 +++++++++++++++++++++++++ tests/disas/winch/x64/i32x4/mul/mul.wat | 45 +++++++++++++++++++++++++ tests/disas/winch/x64/i64x2/mul/mul.wat | 36 ++++++++++++++++++++ winch/codegen/src/isa/aarch64/masm.rs | 10 ++++++ winch/codegen/src/isa/x64/asm.rs | 25 +++++++++++--- winch/codegen/src/isa/x64/masm.rs | 40 +++++++++++++++++++++- winch/codegen/src/masm.rs | 8 +++-- 
winch/codegen/src/visitor.rs | 27 +++++++++++++++ 8 files changed, 229 insertions(+), 7 deletions(-) create mode 100644 tests/disas/winch/x64/i16x8/mul/mul.wat create mode 100644 tests/disas/winch/x64/i32x4/mul/mul.wat create mode 100644 tests/disas/winch/x64/i64x2/mul/mul.wat diff --git a/tests/disas/winch/x64/i16x8/mul/mul.wat b/tests/disas/winch/x64/i16x8/mul/mul.wat new file mode 100644 index 000000000000..2b2a0193b2b4 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/mul/mul.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.mul + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpmullw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4/mul/mul.wat b/tests/disas/winch/x64/i32x4/mul/mul.wat new file mode 100644 index 000000000000..bbdacb23e55d --- /dev/null +++ b/tests/disas/winch/x64/i32x4/mul/mul.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i32x4.mul + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpmulld %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %bh, (%rcx) +;; 51: addl $0, %eax +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2/mul/mul.wat b/tests/disas/winch/x64/i64x2/mul/mul.wat new file mode 100644 index 000000000000..8d86f137cbb4 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/mul/mul.wat @@ -0,0 +1,36 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx512vl", "-Ccranelift-has-avx", "-Ccranelift-has-avx512dq", ] + +(module + (memory 1 1) + (func (result v128) + (i64x2.mul + (i64x2.splat (i64.const 10)) + (i64x2.splat (i64.const 10)) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; vpshufd $0x44, 0x1b(%rip), %xmm0 +;; vpshufd $0x44, 0x12(%rip), %xmm1 +;; vpmullq %xmm1, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: orb (%rax), %al +;; 52: addb %al, (%rax) +;; 54: addb %al, (%rax) +;; 56: addb %al, (%rax) diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index b775d06821b3..2d882b2aef5e 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1121,6 +1121,16 @@ impl Masm for MacroAssembler { ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } + + fn v128_mul( + &mut self, + _lhs: Reg, + _rhs: Reg, + _dst: WritableReg, + _size: OperandSize, + ) -> Result<()> { + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } } impl MacroAssembler { diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index 1249b0314c2b..6b0f0d544f34 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -18,10 +18,10 @@ use cranelift_codegen::{ unwind::UnwindInst, x64::{ args::{ - self, AluRmiROpcode, Amode, AvxOpcode, CmpOpcode, DivSignedness, ExtMode, - FenceKind, FromWritableReg, Gpr, GprMem, GprMemImm, Imm8Gpr, Imm8Reg, RegMem, - RegMemImm, ShiftKind as CraneliftShiftKind, SseOpcode, SyntheticAmode, WritableGpr, - WritableXmm, Xmm, XmmMem, XmmMemAligned, XmmMemImm, CC, + self, AluRmiROpcode, Amode, Avx512Opcode, AvxOpcode, CmpOpcode, 
DivSignedness, + ExtMode, FenceKind, FromWritableReg, Gpr, GprMem, GprMemImm, Imm8Gpr, Imm8Reg, + RegMem, RegMemImm, ShiftKind as CraneliftShiftKind, SseOpcode, SyntheticAmode, + WritableGpr, WritableXmm, Xmm, XmmMem, XmmMemAligned, XmmMemImm, CC, }, encoding::rex::{encode_modrm, RexFlags}, settings as x64_settings, AtomicRmwSeqOp, EmitInfo, EmitState, Inst, @@ -2075,6 +2075,23 @@ impl Assembler { }, }); } + + pub(crate) fn xmm_rm_rvex3( + &mut self, + op: Avx512Opcode, + src1: Reg, + src2: Reg, + dst: WritableReg, + ) { + self.emit(Inst::XmmRmREvex3 { + op, + // `src1` reuses `dst`, and is ignored in emission + src1: dst.to_reg().into(), + src2: src1.into(), + src3: src2.into(), + dst: dst.map(Into::into), + }); + } } /// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted, diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index cb7f056954ae..08f97def2a19 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -34,7 +34,7 @@ use cranelift_codegen::{ isa::{ unwind::UnwindInst, x64::{ - args::{AvxOpcode, FenceKind, CC}, + args::{Avx512Opcode, AvxOpcode, FenceKind, CC}, settings as x64_settings, AtomicRmwSeqOp, }, }, @@ -1912,6 +1912,28 @@ impl Masm for MacroAssembler { self.asm.xmm_rmi_rvex(op, lhs, rhs, dst); Ok(()) } + + fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { + let mul_avx = |this: &mut Self, op| { + this.ensure_has_avx()?; + this.asm.xmm_rmi_rvex(op, lhs, rhs, dst); + Ok(()) + }; + + let mul_avx512 = |this: &mut Self, op| { + this.ensure_has_avx512vl()?; + this.ensure_has_avx512dq()?; + this.asm.xmm_rm_rvex3(op, lhs, rhs, dst); + Ok(()) + }; + + match size { + OperandSize::S16 => mul_avx(self, AvxOpcode::Vpmullw), + OperandSize::S32 => mul_avx(self, AvxOpcode::Vpmulld), + OperandSize::S64 => mul_avx512(self, Avx512Opcode::Vpmullq), + _ => bail!(CodeGenError::unexpected_operand_size()), + } + } } impl 
MacroAssembler { @@ -1953,6 +1975,22 @@ impl MacroAssembler { Ok(()) } + fn ensure_has_avx512vl(&self) -> Result<()> { + anyhow::ensure!( + self.flags.has_avx512vl(), + CodeGenError::UnimplementedForNoAvx2 + ); + Ok(()) + } + + fn ensure_has_avx512dq(&self) -> Result<()> { + anyhow::ensure!( + self.flags.has_avx512dq(), + CodeGenError::UnimplementedForNoAvx2 + ); + Ok(()) + } + fn increment_sp(&mut self, bytes: u32) { self.sp_offset += bytes; diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index 9a43bf145b3c..9917f8862d94 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1646,11 +1646,15 @@ pub(crate) trait MacroAssembler { /// If any bit in `src` is 1, set `dst` to 1, or 0 otherwise. fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()>; - /// Perform a vector add between `src` and `dst`, placing the result in `dst`, where each lane + /// Perform a vector add between `lsh` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; - /// Perform a vector sub between `src` and `dst`, placing the result in `dst`, where each lane + /// Perform a vector sub between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; + + /// Perform a vector lane-wise mul between `lhs` and `rhs`, placing the result in `dst`, where each lane + /// is interpreted to be `size` long. + fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 2f8eed0f75ec..8f5d638bf7b5 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -426,6 +426,9 @@ macro_rules! 
def_unsupported { (emit I16x8Sub $($rest:tt)*) => {}; (emit I32x4Sub $($rest:tt)*) => {}; (emit I64x2Sub $($rest:tt)*) => {}; + (emit I16x8Mul $($rest:tt)*) => {}; + (emit I32x4Mul $($rest:tt)*) => {}; + (emit I64x2Mul $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3592,6 +3595,30 @@ where }) } + fn visit_i16x8_mul(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_mul(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i32x4_mul(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S32, |masm, dst, src, size| { + masm.v128_mul(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i64x2_mul(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { + masm.v128_mul(dst, src, writable!(dst), size)?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); } From a05f46ee54d8e37826484aae46cd2344a3a5d661 Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 13:26:58 +0100 Subject: [PATCH 04/10] packed integer saturating add --- tests/disas/winch/x64/i16x8/add/add_sat_s.wat | 45 ++++++++++++ tests/disas/winch/x64/i16x8/add/add_sat_u.wat | 45 ++++++++++++ tests/disas/winch/x64/i8x16/add/add_sat_s.wat | 45 ++++++++++++ tests/disas/winch/x64/i8x16/add/add_sat_u.wat | 45 ++++++++++++ winch/codegen/src/isa/aarch64/masm.rs | 1 + winch/codegen/src/isa/x64/masm.rs | 39 +++++++--- winch/codegen/src/masm.rs | 19 ++++- winch/codegen/src/visitor.rs | 73 +++++++++++++++++-- 8 files changed, 292 insertions(+), 20 deletions(-) create mode 100644 tests/disas/winch/x64/i16x8/add/add_sat_s.wat create mode 100644 tests/disas/winch/x64/i16x8/add/add_sat_u.wat create mode 100644 tests/disas/winch/x64/i8x16/add/add_sat_s.wat create mode 100644 
tests/disas/winch/x64/i8x16/add/add_sat_u.wat diff --git a/tests/disas/winch/x64/i16x8/add/add_sat_s.wat b/tests/disas/winch/x64/i16x8/add/add_sat_s.wat new file mode 100644 index 000000000000..0f5076182406 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/add/add_sat_s.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.add_sat_s + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddsw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i16x8/add/add_sat_u.wat b/tests/disas/winch/x64/i16x8/add/add_sat_u.wat new file mode 100644 index 000000000000..91044110f38c --- /dev/null +++ b/tests/disas/winch/x64/i16x8/add/add_sat_u.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.add_sat_u + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddusw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/add/add_sat_s.wat b/tests/disas/winch/x64/i8x16/add/add_sat_s.wat new file mode 100644 index 000000000000..c03ae6e3e33d --- /dev/null +++ b/tests/disas/winch/x64/i8x16/add/add_sat_s.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.add_sat_s + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddsb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/add/add_sat_u.wat b/tests/disas/winch/x64/i8x16/add/add_sat_u.wat new file mode 100644 index 000000000000..460b2ef6f692 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/add/add_sat_u.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.add_sat_u + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpaddusb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index 2d882b2aef5e..c9c78ad32b4d 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1108,6 +1108,7 @@ impl Masm for MacroAssembler { _rhs: Reg, _dst: WritableReg, _size: OperandSize, + _handle_overflow: HandleOverflowKind, ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 08f97def2a19..a64eb71a81ee 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -7,10 +7,7 @@ use super::{ use anyhow::{anyhow, bail, Result}; use crate::masm::{ - DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, Imm as I, IntCmpKind, LaneSelector, - LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, - RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, VectorCompareKind, - VectorEqualityKind, Zero, 
TRUSTED_FLAGS, UNTRUSTED_FLAGS, + DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS }; use crate::{ abi::{self, align_to, calculate_frame_adjustment, LocalSlot}, @@ -1883,18 +1880,38 @@ impl Masm for MacroAssembler { Ok(()) } - fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { + fn v128_add( + &mut self, + lhs: Reg, + rhs: Reg, + dst: WritableReg, + size: OperandSize, + handle_overflow_kind: HandleOverflowKind, + ) -> Result<()> { self.ensure_has_avx()?; - let op = match size { - OperandSize::S8 => AvxOpcode::Vpaddb, - OperandSize::S16 => AvxOpcode::Vpaddw, - OperandSize::S32 => AvxOpcode::Vpaddd, - OperandSize::S64 => AvxOpcode::Vpaddq, - OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + let op = match handle_overflow_kind { + HandleOverflowKind::None => match size { + OperandSize::S8 => AvxOpcode::Vpaddb, + OperandSize::S16 => AvxOpcode::Vpaddw, + OperandSize::S32 => AvxOpcode::Vpaddd, + OperandSize::S64 => AvxOpcode::Vpaddq, + OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + }, + HandleOverflowKind::SignedSaturating => match size { + OperandSize::S8 => AvxOpcode::Vpaddsb, + OperandSize::S16 => AvxOpcode::Vpaddsw, + _ => bail!(CodeGenError::unexpected_operand_size()), + }, + HandleOverflowKind::UnsignedSaturating => match size { + OperandSize::S8 => AvxOpcode::Vpaddusb, + OperandSize::S16 => AvxOpcode::Vpaddusw, + _ => bail!(CodeGenError::unexpected_operand_size()), + }, }; self.asm.xmm_rmi_rvex(op, lhs, rhs, dst); + Ok(()) } diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index 9917f8862d94..cc3f8ab4e1b9 100644 --- a/winch/codegen/src/masm.rs +++ 
b/winch/codegen/src/masm.rs @@ -225,6 +225,16 @@ pub(crate) enum Extend { __Kind(T), } +/// How to handle overflow. +pub enum HandleOverflowKind { + /// Do nothing. + None, + /// Perform signed saturation. + SignedSaturating, + /// Perform unsigned saturation. + UnsignedSaturating, +} + impl From> for ExtendKind { fn from(value: Extend) -> Self { ExtendKind::Unsigned(value) @@ -1648,7 +1658,14 @@ pub(crate) trait MacroAssembler { /// Perform a vector add between `lsh` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. - fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; + fn v128_add( + &mut self, + lhs: Reg, + rhs: Reg, + dst: WritableReg, + size: OperandSize, + handle_overflow: HandleOverflowKind, + ) -> Result<()>; /// Perform a vector sub between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 8f5d638bf7b5..0c18b025c89b 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -10,10 +10,7 @@ use crate::codegen::{ FnCall, }; use crate::masm::{ - DivKind, Extend, ExtractLaneKind, FloatCmpKind, IntCmpKind, LoadKind, MacroAssembler, - MemMoveDirection, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, - RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, TruncKind, - V128LoadExtendKind, VectorCompareKind, VectorEqualityKind, Zero, + DivKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, MacroAssembler, MemMoveDirection, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, TruncKind, V128LoadExtendKind, VectorCompareKind, VectorEqualityKind, Zero }; use crate::reg::{writable, Reg}; @@ -429,6 +426,10 @@ macro_rules! 
def_unsupported { (emit I16x8Mul $($rest:tt)*) => {}; (emit I32x4Mul $($rest:tt)*) => {}; (emit I64x2Mul $($rest:tt)*) => {}; + (emit I8x16AddSatS $($rest:tt)*) => {}; + (emit I16x8AddSatS $($rest:tt)*) => {}; + (emit I8x16AddSatU $($rest:tt)*) => {}; + (emit I16x8AddSatU $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3534,7 +3535,7 @@ where fn visit_i8x16_add(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { - masm.v128_add(dst, src, writable!(dst), size)?; + masm.v128_add(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3542,7 +3543,7 @@ where fn visit_i16x8_add(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { - masm.v128_add(dst, src, writable!(dst), size)?; + masm.v128_add(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3550,7 +3551,7 @@ where fn visit_i32x4_add(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S32, |masm, dst, src, size| { - masm.v128_add(dst, src, writable!(dst), size)?; + masm.v128_add(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3558,7 +3559,7 @@ where fn visit_i64x2_add(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { - masm.v128_add(dst, src, writable!(dst), size)?; + masm.v128_add(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3619,6 +3620,62 @@ where }) } + fn visit_i8x16_add_sat_s(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_add( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::SignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i16x8_add_sat_s(&mut 
self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_add( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::SignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i8x16_add_sat_u(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_add( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::UnsignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i16x8_add_sat_u(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_add( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::UnsignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); } From 87c675986fd2b1f7bec3e43af3f987cf3e4add42 Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 13:39:48 +0100 Subject: [PATCH 05/10] packed integer saturating sub --- tests/disas/winch/x64/i16x8/sub/sub_sat_s.wat | 45 ++++++++++++ tests/disas/winch/x64/i16x8/sub/sub_sat_u.wat | 45 ++++++++++++ tests/disas/winch/x64/i8x16/sub/sub_sat_s.wat | 45 ++++++++++++ tests/disas/winch/x64/i8x16/sub/sub_sat_u.wat | 45 ++++++++++++ winch/codegen/src/isa/aarch64/masm.rs | 1 + winch/codegen/src/isa/x64/masm.rs | 34 ++++++++-- winch/codegen/src/masm.rs | 9 ++- winch/codegen/src/visitor.rs | 68 +++++++++++++++++-- 8 files changed, 280 insertions(+), 12 deletions(-) create mode 100644 tests/disas/winch/x64/i16x8/sub/sub_sat_s.wat create mode 100644 tests/disas/winch/x64/i16x8/sub/sub_sat_u.wat create mode 100644 tests/disas/winch/x64/i8x16/sub/sub_sat_s.wat create mode 100644 tests/disas/winch/x64/i8x16/sub/sub_sat_u.wat diff --git a/tests/disas/winch/x64/i16x8/sub/sub_sat_s.wat b/tests/disas/winch/x64/i16x8/sub/sub_sat_s.wat new file mode 100644 index 000000000000..c2a60bb6a66d 
--- /dev/null +++ b/tests/disas/winch/x64/i16x8/sub/sub_sat_s.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.sub_sat_s + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubsw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i16x8/sub/sub_sat_u.wat b/tests/disas/winch/x64/i16x8/sub/sub_sat_u.wat new file mode 100644 index 000000000000..9826ba671b6e --- /dev/null +++ b/tests/disas/winch/x64/i16x8/sub/sub_sat_u.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i16x8.sub_sat_u + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubusw %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/sub/sub_sat_s.wat b/tests/disas/winch/x64/i8x16/sub/sub_sat_s.wat new file mode 100644 index 000000000000..a580be69ce1a --- /dev/null +++ b/tests/disas/winch/x64/i8x16/sub/sub_sat_s.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.sub_sat_s + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubsb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/sub/sub_sat_u.wat b/tests/disas/winch/x64/i8x16/sub/sub_sat_u.wat new file mode 100644 index 000000000000..7e3197f95a9b --- /dev/null +++ b/tests/disas/winch/x64/i8x16/sub/sub_sat_u.wat @@ -0,0 +1,45 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (result v128) + (i8x16.sub_sat_u + (v128.const i64x2 42 42) + (v128.const i64x2 1337 1337) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4a +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; movdqu 0x24(%rip), %xmm1 +;; vpsubusb %xmm0, %xmm1, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4a: ud2 +;; 4c: addb %al, (%rax) +;; 4e: addb %al, (%rax) +;; 50: cmpl %eax, (%rip) +;; 56: addb %al, (%rax) +;; 58: cmpl %eax, (%rip) +;; 5e: addb %al, (%rax) +;; 60: subb (%rax), %al +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: subb (%rax), %al +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index c9c78ad32b4d..cc7055991da7 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1119,6 +1119,7 @@ impl Masm for MacroAssembler { _rhs: Reg, _dst: WritableReg, _size: OperandSize, + _handle_overflow: HandleOverflowKind, ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index a64eb71a81ee..e929a1956488 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -1915,18 +1915,38 @@ impl Masm for MacroAssembler { Ok(()) } - fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { + fn v128_sub( + &mut self, + lhs: Reg, + rhs: Reg, + dst: WritableReg, + size: OperandSize, + handle_overflow_kind: HandleOverflowKind, + ) -> Result<()> { self.ensure_has_avx()?; - let op = match size { - OperandSize::S8 => AvxOpcode::Vpsubb, - 
OperandSize::S16 => AvxOpcode::Vpsubw, - OperandSize::S32 => AvxOpcode::Vpsubd, - OperandSize::S64 => AvxOpcode::Vpsubq, - OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + let op = match handle_overflow_kind { + HandleOverflowKind::None => match size { + OperandSize::S8 => AvxOpcode::Vpsubb, + OperandSize::S16 => AvxOpcode::Vpsubw, + OperandSize::S32 => AvxOpcode::Vpsubd, + OperandSize::S64 => AvxOpcode::Vpsubq, + OperandSize::S128 => bail!(CodeGenError::unexpected_operand_size()), + }, + HandleOverflowKind::SignedSaturating => match size { + OperandSize::S8 => AvxOpcode::Vpsubsb, + OperandSize::S16 => AvxOpcode::Vpsubsw, + _ => bail!(CodeGenError::unexpected_operand_size()), + }, + HandleOverflowKind::UnsignedSaturating => match size { + OperandSize::S8 => AvxOpcode::Vpsubusb, + OperandSize::S16 => AvxOpcode::Vpsubusw, + _ => bail!(CodeGenError::unexpected_operand_size()), + }, }; self.asm.xmm_rmi_rvex(op, lhs, rhs, dst); + Ok(()) } diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index cc3f8ab4e1b9..a65212f15adb 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1669,7 +1669,14 @@ pub(crate) trait MacroAssembler { /// Perform a vector sub between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. - fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; + fn v128_sub( + &mut self, + lhs: Reg, + rhs: Reg, + dst: WritableReg, + size: OperandSize, + handle_overflow: HandleOverflowKind, + ) -> Result<()>; /// Perform a vector lane-wise mul between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 0c18b025c89b..00a725ba4661 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -430,6 +430,10 @@ macro_rules! 
def_unsupported { (emit I16x8AddSatS $($rest:tt)*) => {}; (emit I8x16AddSatU $($rest:tt)*) => {}; (emit I16x8AddSatU $($rest:tt)*) => {}; + (emit I8x16SubSatS $($rest:tt)*) => {}; + (emit I16x8SubSatS $($rest:tt)*) => {}; + (emit I8x16SubSatU $($rest:tt)*) => {}; + (emit I16x8SubSatU $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3567,7 +3571,7 @@ where fn visit_i8x16_sub(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { - masm.v128_sub(dst, src, writable!(dst), size)?; + masm.v128_sub(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3575,7 +3579,7 @@ where fn visit_i16x8_sub(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { - masm.v128_sub(dst, src, writable!(dst), size)?; + masm.v128_sub(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3583,7 +3587,7 @@ where fn visit_i32x4_sub(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S32, |masm, dst, src, size| { - masm.v128_sub(dst, src, writable!(dst), size)?; + masm.v128_sub(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3591,7 +3595,7 @@ where fn visit_i64x2_sub(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { - masm.v128_sub(dst, src, writable!(dst), size)?; + masm.v128_sub(dst, src, writable!(dst), size, HandleOverflowKind::None)?; Ok(TypedReg::new(WasmValType::V128, dst)) }) } @@ -3676,6 +3680,62 @@ where }) } + fn visit_i8x16_sub_sat_s(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_sub( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::SignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn 
visit_i16x8_sub_sat_s(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_sub( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::SignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i8x16_sub_sat_u(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S8, |masm, dst, src, size| { + masm.v128_sub( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::UnsignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + + fn visit_i16x8_sub_sat_u(&mut self) -> Self::Output { + self.context + .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { + masm.v128_sub( + dst, + src, + writable!(dst), + size, + HandleOverflowKind::UnsignedSaturating, + )?; + Ok(TypedReg::new(WasmValType::V128, dst)) + }) + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); } From cec7a09ffa25003ebd149d7ec6a88466a7756749 Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 14:10:22 +0100 Subject: [PATCH 06/10] fix missing error codes for avx --- winch/codegen/src/codegen/error.rs | 6 ++++++ winch/codegen/src/isa/x64/masm.rs | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/winch/codegen/src/codegen/error.rs b/winch/codegen/src/codegen/error.rs index 344a514e1d0c..2a3d652aa940 100644 --- a/winch/codegen/src/codegen/error.rs +++ b/winch/codegen/src/codegen/error.rs @@ -26,6 +26,12 @@ pub(crate) enum CodeGenError { /// Unimplemented due to requiring AVX2. #[error("Instruction not implemented for CPUs without AVX2 support")] UnimplementedForNoAvx2, + /// Unimplemented due to requiring AVX512VL. + #[error("Instruction not implemented for CPUs without AVX512VL support")] + UnimplementedForNoAvx512VL, + /// Unimplemented due to requiring AVX512DQ. + #[error("Instruction not implemented for CPUs without AVX512DQ support")] + UnimplementedForNoAvx512DQ, /// Unsupported eager initialization of tables. 
#[error("Unsupported eager initialization of tables")] UnsupportedTableEagerInit, diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index e929a1956488..36736a144d37 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -2015,7 +2015,7 @@ impl MacroAssembler { fn ensure_has_avx512vl(&self) -> Result<()> { anyhow::ensure!( self.flags.has_avx512vl(), - CodeGenError::UnimplementedForNoAvx2 + CodeGenError::UnimplementedForNoAvx512VL ); Ok(()) } @@ -2023,7 +2023,7 @@ impl MacroAssembler { fn ensure_has_avx512dq(&self) -> Result<()> { anyhow::ensure!( self.flags.has_avx512dq(), - CodeGenError::UnimplementedForNoAvx2 + CodeGenError::UnimplementedForNoAvx512DQ ); Ok(()) } From c783cda53577d34959becaf6a972d9c618f41adc Mon Sep 17 00:00:00 2001 From: adhoc Date: Wed, 29 Jan 2025 22:15:34 +0100 Subject: [PATCH 07/10] change `size` to `lane_width` --- winch/codegen/src/masm.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index a65212f15adb..85d713e2a01d 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1657,28 +1657,32 @@ pub(crate) trait MacroAssembler { fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()>; /// Perform a vector add between `lsh` and `rhs`, placing the result in `dst`, where each lane - /// is interpreted to be `size` long. + /// is interpreted to be `lane_width` long. + /// + /// `handle_overflow` determines how overflow should be handled. fn v128_add( &mut self, lhs: Reg, rhs: Reg, dst: WritableReg, - size: OperandSize, + lane_width: OperandSize, handle_overflow: HandleOverflowKind, ) -> Result<()>; /// Perform a vector sub between `lhs` and `rhs`, placing the result in `dst`, where each lane - /// is interpreted to be `size` long. + /// is interpreted to be `lane_width` long. + /// + /// `handle_overflow` determines how overflow should be handled. 
fn v128_sub( &mut self, lhs: Reg, rhs: Reg, dst: WritableReg, - size: OperandSize, + lane_width: OperandSize, handle_overflow: HandleOverflowKind, ) -> Result<()>; /// Perform a vector lane-wise mul between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. - fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; + fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, lane_width: OperandSize) -> Result<()>; } From afa529413bc4e87b6b145db4357d0426640445a6 Mon Sep 17 00:00:00 2001 From: adhoc Date: Thu, 30 Jan 2025 12:53:24 +0100 Subject: [PATCH 08/10] fmt --- winch/codegen/src/masm.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index 85d713e2a01d..d51895a4a05a 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1684,5 +1684,11 @@ pub(crate) trait MacroAssembler { /// Perform a vector lane-wise mul between `lhs` and `rhs`, placing the result in `dst`, where each lane /// is interpreted to be `size` long. 
- fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, lane_width: OperandSize) -> Result<()>; + fn v128_mul( + &mut self, + lhs: Reg, + rhs: Reg, + dst: WritableReg, + lane_width: OperandSize, + ) -> Result<()>; } From 55046ec809bb522da5a1d569bf29aeea0e27d20e Mon Sep 17 00:00:00 2001 From: adhoc Date: Fri, 31 Jan 2025 17:54:16 +0100 Subject: [PATCH 09/10] i64x2 mul fallback --- winch/codegen/src/isa/aarch64/masm.rs | 14 ++- winch/codegen/src/isa/x64/asm.rs | 8 +- winch/codegen/src/isa/x64/masm.rs | 126 +++++++++++++++++++++++--- winch/codegen/src/masm.rs | 4 +- winch/codegen/src/visitor.rs | 23 ++--- 5 files changed, 134 insertions(+), 41 deletions(-) diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index cc7055991da7..fef5dccfef45 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -13,10 +13,10 @@ use crate::{ CallingConvention, }, masm::{ - CalleeKind, DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, Imm as I, - IntCmpKind, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, - ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, SplatKind, StackSlot, StoreKind, - TrapCode, TruncKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, + CalleeKind, DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, + Imm as I, IntCmpKind, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, + RemKind, ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, SplatKind, StackSlot, + StoreKind, TrapCode, TruncKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS, }, stack::TypedReg, @@ -1126,10 +1126,8 @@ impl Masm for MacroAssembler { fn v128_mul( &mut self, - _lhs: Reg, - _rhs: Reg, - _dst: WritableReg, - _size: OperandSize, + _context: &mut CodeGenContext, + _lane_width: OperandSize, ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) 
} diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index 6b0f0d544f34..ad08f9e8e7ac 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -1880,7 +1880,13 @@ impl Assembler { } } - pub fn xmm_rmi_rvex(&mut self, op: AvxOpcode, src1: Reg, src2: Reg, dst: WritableReg) { + pub fn xmm_rmi_rvex( + &mut self, + op: AvxOpcode, + src1: Reg, + src2: impl Into, + dst: WritableReg, + ) { self.emit(Inst::XmmRmiRVex { op, src1: src1.into(), diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 36736a144d37..460038b07ccf 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -7,7 +7,10 @@ use super::{ use anyhow::{anyhow, bail, Result}; use crate::masm::{ - DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS + DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, + IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, + RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, + TruncKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS, }; use crate::{ abi::{self, align_to, calculate_frame_adjustment, LocalSlot}, @@ -31,7 +34,7 @@ use cranelift_codegen::{ isa::{ unwind::UnwindInst, x64::{ - args::{Avx512Opcode, AvxOpcode, FenceKind, CC}, + args::{Avx512Opcode, AvxOpcode, FenceKind, RegMemImm, XmmMemImm, CC}, settings as x64_settings, AtomicRmwSeqOp, }, }, @@ -1950,26 +1953,123 @@ impl Masm for MacroAssembler { Ok(()) } - fn v128_mul(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> 
Result<()> { + fn v128_mul( + &mut self, + context: &mut CodeGenContext, + lane_width: OperandSize, + ) -> Result<()> { + self.ensure_has_avx()?; + + let rhs = context.pop_to_reg(self, None)?; + let lhs = context.pop_to_reg(self, None)?; + let mul_avx = |this: &mut Self, op| { - this.ensure_has_avx()?; - this.asm.xmm_rmi_rvex(op, lhs, rhs, dst); - Ok(()) + this.asm + .xmm_rmi_rvex(op, lhs.reg, rhs.reg, writable!(lhs.reg)); }; - let mul_avx512 = |this: &mut Self, op| { - this.ensure_has_avx512vl()?; - this.ensure_has_avx512dq()?; - this.asm.xmm_rm_rvex3(op, lhs, rhs, dst); - Ok(()) + let mul_i64x2_avx512 = |this: &mut Self| { + this.asm + .xmm_rm_rvex3(Avx512Opcode::Vpmullq, lhs.reg, rhs.reg, writable!(lhs.reg)); }; - match size { + let mul_i64x2_fallback = + |this: &mut Self, context: &mut CodeGenContext| -> Result<()> { + // Standard AVX doesn't have an instruction for i64x2 multiplication, instead, we have to fallback + // to an instruction sequence using 32bits multiplication (taken from cranelift + // implementation, in `isa/x64/lower.isle`): + // + // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of + // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand + // > multiplication can then be written as: + // + // > Ah Al + // > * Bh Bl + // > ----- + // > Al * Bl + // > + (Ah * Bl) << 32 + // > + (Al * Bh) << 32 + // + // > So for each lane we will compute: + // + // > A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 + // + // > Note, the algorithm will use `pmuludq` which operates directly on the lower + // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of + // > the lane of the destination. For this reason we don't need shifts to isolate + // > the lower 32-bits, however, we will need to use shifts to isolate the high + // > 32-bits when doing calculations, i.e., `Ah == A >> 32`. 
+ + let tmp1 = regs::scratch_xmm(); + let tmp2 = context.any_fpr(this)?; + + // tmp1 = lhs_hi = (lhs >> 32) + this.asm.xmm_rmi_rvex( + AvxOpcode::Vpsrlq, + lhs.reg, + XmmMemImm::unwrap_new(RegMemImm::imm(32)), + writable!(tmp1), + ); + // tmp2 = lhs_hi * rhs_low = tmp1 * rhs + this.asm + .xmm_rmi_rvex(AvxOpcode::Vpmuldq, tmp1, rhs.reg, writable!(tmp2)); + + // tmp1 = rhs_hi = rhs >> 32 + this.asm.xmm_rmi_rvex( + AvxOpcode::Vpsrlq, + rhs.reg, + XmmMemImm::unwrap_new(RegMemImm::imm(32)), + writable!(tmp1), + ); + + // tmp1 = lhs_low * rhs_high = tmp1 * lhs + this.asm + .xmm_rmi_rvex(AvxOpcode::Vpmuludq, tmp1, lhs.reg, writable!(tmp1)); + + // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2 + this.asm + .xmm_rmi_rvex(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(tmp1)); + + // tmp1 = tmp1 << 32 + this.asm.xmm_rmi_rvex( + AvxOpcode::Vpsllq, + tmp1, + XmmMemImm::unwrap_new(RegMemImm::imm(32)), + writable!(tmp1), + ); + + // tmp2 = lhs_lo * rhs_lo + this.asm + .xmm_rmi_rvex(AvxOpcode::Vpmuludq, lhs.reg, rhs.reg, writable!(tmp2)); + + // finally, with `lhs` as destination: + // lhs = (lhs_low * rhs_low) + (((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) << 32) = tmp1 + tmp2 + this.asm + .xmm_rmi_rvex(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(lhs.reg)); + + context.free_reg(tmp2); + + Ok(()) + }; + + match lane_width { OperandSize::S16 => mul_avx(self, AvxOpcode::Vpmullw), OperandSize::S32 => mul_avx(self, AvxOpcode::Vpmulld), - OperandSize::S64 => mul_avx512(self, Avx512Opcode::Vpmullq), + // This is the fast path when AVX512 is available. + OperandSize::S64 + if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() => + { + mul_i64x2_avx512(self) + } + // Otherwise, we emit AVX fallback sequence. 
+ OperandSize::S64 => mul_i64x2_fallback(self, context)?, _ => bail!(CodeGenError::unexpected_operand_size()), } + + context.stack.push(lhs.into()); + context.free_reg(rhs); + + Ok(()) } } diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index d51895a4a05a..55a70419dd9e 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1686,9 +1686,7 @@ pub(crate) trait MacroAssembler { /// is interpreted to be `size` long. fn v128_mul( &mut self, - lhs: Reg, - rhs: Reg, - dst: WritableReg, + context: &mut CodeGenContext, lane_width: OperandSize, ) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 00a725ba4661..136437ba45e6 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -10,7 +10,10 @@ use crate::codegen::{ FnCall, }; use crate::masm::{ - DivKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, MacroAssembler, MemMoveDirection, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, TruncKind, V128LoadExtendKind, VectorCompareKind, VectorEqualityKind, Zero + DivKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, + MacroAssembler, MemMoveDirection, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, + RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, + TruncKind, V128LoadExtendKind, VectorCompareKind, VectorEqualityKind, Zero, }; use crate::reg::{writable, Reg}; @@ -3601,27 +3604,15 @@ where } fn visit_i16x8_mul(&mut self) -> Self::Output { - self.context - .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { - masm.v128_mul(dst, src, writable!(dst), size)?; - Ok(TypedReg::new(WasmValType::V128, dst)) - }) + self.masm.v128_mul(&mut self.context, OperandSize::S16) } fn visit_i32x4_mul(&mut self) -> Self::Output { - self.context - .binop(self.masm, 
OperandSize::S32, |masm, dst, src, size| { - masm.v128_mul(dst, src, writable!(dst), size)?; - Ok(TypedReg::new(WasmValType::V128, dst)) - }) + self.masm.v128_mul(&mut self.context, OperandSize::S32) } fn visit_i64x2_mul(&mut self) -> Self::Output { - self.context - .binop(self.masm, OperandSize::S64, |masm, dst, src, size| { - masm.v128_mul(dst, src, writable!(dst), size)?; - Ok(TypedReg::new(WasmValType::V128, dst)) - }) + self.masm.v128_mul(&mut self.context, OperandSize::S64) } fn visit_i8x16_add_sat_s(&mut self) -> Self::Output { From 8a9cde43c22713a7672928cae5ea37b8941c53f5 Mon Sep 17 00:00:00 2001 From: adhoc Date: Fri, 31 Jan 2025 23:33:54 +0100 Subject: [PATCH 10/10] add fallback test. --- .../winch/x64/i64x2/mul/mul_fallback.wat | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/disas/winch/x64/i64x2/mul/mul_fallback.wat diff --git a/tests/disas/winch/x64/i64x2/mul/mul_fallback.wat b/tests/disas/winch/x64/i64x2/mul/mul_fallback.wat new file mode 100644 index 000000000000..d344433ce2a2 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/mul/mul_fallback.wat @@ -0,0 +1,40 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (memory 1 1) + (func (param v128 v128) (result v128) + (i64x2.mul + (local.get 0) + (local.get 1) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x30, %r11 +;; cmpq %rsp, %r11 +;; ja 0x72 +;; 1c: movq %rdi, %r14 +;; subq $0x30, %rsp +;; movq %rdi, 0x28(%rsp) +;; movq %rsi, 0x20(%rsp) +;; movdqu %xmm0, 0x10(%rsp) +;; movdqu %xmm1, (%rsp) +;; movdqu (%rsp), %xmm0 +;; movdqu 0x10(%rsp), %xmm1 +;; vpsrlq $0x20, %xmm1, %xmm15 +;; vpmuldq %xmm0, %xmm15, %xmm2 +;; vpsrlq $0x20, %xmm0, %xmm15 +;; vpmuludq %xmm1, %xmm15, %xmm15 +;; vpaddq %xmm2, %xmm15, %xmm15 +;; vpsllq $0x20, %xmm15, %xmm15 +;; vpmuludq %xmm0, %xmm1, %xmm2 +;; vpaddq %xmm2, %xmm15, %xmm1 +;; movdqa %xmm1, %xmm0 +;; addq $0x30, %rsp +;; popq %rbp +;; retq +;; 72: ud2