[WebAssembly] Support promoting lower lanes of f16x8 to f32x4. #129786

Open

brendandahl wants to merge 1 commit into main

Conversation

brendandahl (Contributor)

No description provided.

llvmbot added labels on Mar 4, 2025: clang (Clang issues not falling into any other category), backend:WebAssembly, backend:X86, clang:headers (Headers provided by Clang, e.g. for intrinsics), mc (Machine (object) code)
llvmbot (Member) commented Mar 4, 2025

@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-webassembly
@llvm/pr-subscribers-mc

Author: Brendan Dahl (brendandahl)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/129786.diff

6 Files Affected:

  • (modified) clang/lib/Headers/wasm_simd128.h (+8)
  • (modified) cross-project-tests/intrinsic-header-tests/wasm_simd128.c (+6)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+31-14)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td (+1)
  • (modified) llvm/test/CodeGen/WebAssembly/half-precision.ll (+20)
  • (modified) llvm/test/MC/WebAssembly/simd-encodings.s (+3)
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
index 08e39bf1a79b4..c509d7841135e 100644
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -45,6 +45,7 @@ typedef int __i32x2 __attribute__((__vector_size__(8), __aligned__(8)));
 typedef unsigned int __u32x2
     __attribute__((__vector_size__(8), __aligned__(8)));
 typedef float __f32x2 __attribute__((__vector_size__(8), __aligned__(8)));
+typedef __fp16 __f16x4 __attribute__((__vector_size__(8), __aligned__(8)));
 
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("simd128"),        \
@@ -2010,6 +2011,13 @@ static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
   return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8);
 }
 
+static __inline__ v128_t __FP16_FN_ATTRS
+wasm_f32x4_promote_low_f16x8(v128_t __a) {
+  return (v128_t) __builtin_convertvector(
+      (__f16x4){((__f16x8)__a)[0], ((__f16x8)__a)[1],
+                ((__f16x8)__a)[2], ((__f16x8)__a)[3]}, __f32x4);
+}
+
 static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a,
                                                                  v128_t __b,
                                                                  v128_t __c) {
diff --git a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
index b601d90cfcc92..1f4809483589e 100644
--- a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
+++ b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
@@ -1033,6 +1033,12 @@ v128_t test_f64x2_promote_low_f32x4(v128_t a) {
   return wasm_f64x2_promote_low_f32x4(a);
 }
 
+// CHECK-LABEL: test_f32x4_promote_low_f16x8:
+// CHECK: f32x4.promote_low_f16x8{{$}}
+v128_t test_f32x4_promote_low_f16x8(v128_t a) {
+  return wasm_f32x4_promote_low_f16x8(a);
+}
+
 // CHECK-LABEL: test_i8x16_shuffle:
 // CHECK: i8x16.shuffle 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
 // 0{{$}}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index b24a45c2d8898..4a034ed508cfe 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2341,7 +2341,7 @@ WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
 
 static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
-  if (Op.getValueType() != MVT::v2f64)
+  if (Op.getValueType() != MVT::v2f64 && Op.getValueType() != MVT::v4f32)
     return SDValue();
 
   auto GetConvertedLane = [](SDValue Op, unsigned &Opcode, SDValue &SrcVec,
@@ -2354,6 +2354,7 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
       Opcode = WebAssemblyISD::CONVERT_LOW_U;
       break;
     case ISD::FP_EXTEND:
+    case ISD::FP16_TO_FP:
       Opcode = WebAssemblyISD::PROMOTE_LOW;
       break;
     default:
@@ -2372,36 +2373,52 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
     return true;
   };
 
-  unsigned LHSOpcode, RHSOpcode, LHSIndex, RHSIndex;
-  SDValue LHSSrcVec, RHSSrcVec;
-  if (!GetConvertedLane(Op.getOperand(0), LHSOpcode, LHSSrcVec, LHSIndex) ||
-      !GetConvertedLane(Op.getOperand(1), RHSOpcode, RHSSrcVec, RHSIndex))
+  unsigned NumLanes = Op.getValueType() == MVT::v2f64 ? 2 : 4;
+  unsigned FirstOpcode = 0, SecondOpcode = 0, ThirdOpcode = 0, FourthOpcode = 0;
+  unsigned FirstIndex = 0, SecondIndex = 0, ThirdIndex = 0, FourthIndex = 0;
+  SDValue FirstSrcVec, SecondSrcVec, ThirdSrcVec, FourthSrcVec;
+
+  if (!GetConvertedLane(Op.getOperand(0), FirstOpcode, FirstSrcVec, FirstIndex) ||
+      !GetConvertedLane(Op.getOperand(1), SecondOpcode, SecondSrcVec, SecondIndex))
+    return SDValue();
+
+  // If we're converting to v4f32, check the third and fourth lanes, too.
+  if (NumLanes == 4 && (!GetConvertedLane(Op.getOperand(2), ThirdOpcode, ThirdSrcVec, ThirdIndex) ||
+                        !GetConvertedLane(Op.getOperand(3), FourthOpcode, FourthSrcVec, FourthIndex)))
+    return SDValue();
+
+  if (FirstOpcode != SecondOpcode)
     return SDValue();
 
-  if (LHSOpcode != RHSOpcode)
+  // TODO Add an optimization similar to the v2f64 below for shuffling the
+  // vectors when the lanes are in the wrong order or come from different src
+  // vectors.
+  if (NumLanes == 4 && (FirstOpcode != ThirdOpcode || FirstOpcode != FourthOpcode ||
+                        FirstSrcVec != SecondSrcVec || FirstSrcVec != ThirdSrcVec || FirstSrcVec != FourthSrcVec ||
+                        FirstIndex != 0 || SecondIndex != 1 || ThirdIndex != 2 || FourthIndex != 3))
     return SDValue();
 
   MVT ExpectedSrcVT;
-  switch (LHSOpcode) {
+  switch (FirstOpcode) {
   case WebAssemblyISD::CONVERT_LOW_S:
   case WebAssemblyISD::CONVERT_LOW_U:
     ExpectedSrcVT = MVT::v4i32;
     break;
   case WebAssemblyISD::PROMOTE_LOW:
-    ExpectedSrcVT = MVT::v4f32;
+    ExpectedSrcVT = NumLanes == 2 ? MVT::v4f32 : MVT::v8i16;
     break;
   }
-  if (LHSSrcVec.getValueType() != ExpectedSrcVT)
+  if (FirstSrcVec.getValueType() != ExpectedSrcVT)
     return SDValue();
 
-  auto Src = LHSSrcVec;
-  if (LHSIndex != 0 || RHSIndex != 1 || LHSSrcVec != RHSSrcVec) {
+  auto Src = FirstSrcVec;
+  if (NumLanes == 2 && (FirstIndex != 0 || SecondIndex != 1 || FirstSrcVec != SecondSrcVec)) {
     // Shuffle the source vector so that the converted lanes are the low lanes.
     Src = DAG.getVectorShuffle(
-        ExpectedSrcVT, DL, LHSSrcVec, RHSSrcVec,
-        {static_cast<int>(LHSIndex), static_cast<int>(RHSIndex) + 4, -1, -1});
+        ExpectedSrcVT, DL, FirstSrcVec, SecondSrcVec,
+        {static_cast<int>(FirstIndex), static_cast<int>(SecondIndex) + 4, -1, -1});
   }
-  return DAG.getNode(LHSOpcode, DL, MVT::v2f64, Src);
+  return DAG.getNode(FirstOpcode, DL, NumLanes == 2 ? MVT::v2f64 : MVT::v4f32, Src);
 }
 
 SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index c591e5ef181a4..d2d62b8b62c3e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1468,6 +1468,7 @@ defm "" : SIMDConvert<F32x4, F64x2, demote_zero,
 def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
 defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
+defm "" : HalfPrecisionConvert<F32x4, I16x8, promote_low, "promote_low_f16x8", 0x14b>;
 
 // Lower extending loads to load64_zero + promote_low
 def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index 4e8ff5955c63b..f0e23ea289265 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -369,3 +369,23 @@ define <8 x half> @shuffle_poison_v8f16(<8 x half> %x, <8 x half> %y) {
                i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x half> %res
 }
+
+define <4 x float> @promote_low_v4f32(<8 x half> %x) {
+; CHECK-LABEL: promote_low_v4f32:
+; CHECK: .functype promote_low_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
+  %v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = fpext <4 x half> %v to <4 x float>
+  ret <4 x float> %a
+}
+
+define <4 x float> @promote_low_v4f32_2(<8 x half> %x) {
+; CHECK-LABEL: promote_low_v4f32_2:
+; CHECK:         .functype promote_low_v4f32_2 (v128) -> (v128)
+; CHECK-NEXT:    f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
+  %v = fpext <8 x half> %x to <8 x float>
+  %a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %a
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4bc52a0c..57af1daad0226 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -935,4 +935,7 @@ main:
     # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02]
     f16x8.convert_i16x8_u
 
+    # CHECK: f32x4.promote_low_f16x8 # encoding: [0xfd,0xcb,0x02]
+    f32x4.promote_low_f16x8
+
     end_function
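
As a quick illustration of the new header intrinsic, here is a minimal usage sketch in C. It is not part of this PR; it assumes a clang build that includes this patch, and the file name, the non-intrinsic function name, and the exact target-feature flag spellings are illustrative assumptions (the fp16 flag in particular may differ).

// promote_low_demo.c -- illustrative only, not part of this patch.
// Hypothetical build line (feature flag spellings are an assumption):
//   clang --target=wasm32 -msimd128 -mfp16 -O2 -c promote_low_demo.c
#include <wasm_simd128.h>

// Widen the low four f16 lanes of a v128 to f32 and add a bias vector.
// With this patch, the promotion should select f32x4.promote_low_f16x8.
v128_t promote_and_bias(v128_t halves, v128_t bias) {
  v128_t promoted = wasm_f32x4_promote_low_f16x8(halves);
  return wasm_f32x4_add(promoted, bias);
}

For reference, the TableGen opcode 0x14b is consistent with the MC test encoding above: with the 0xfd SIMD prefix, 0x14b written as a LEB128 immediate is the byte pair 0xcb 0x02, i.e. [0xfd, 0xcb, 0x02].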

github-actions bot commented Mar 4, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

github-actions bot commented Mar 4, 2025

✅ With the latest revision this PR passed the undef deprecator.

A reviewer (Member) commented on the first new test in llvm/test/CodeGen/WebAssembly/half-precision.ll:

; CHECK: .functype promote_low_v4f32 (v128) -> (v128){{$}}
; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
; CHECK-NEXT: return $pop[[R]]
%v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

Suggested change:
-%v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v = shufflevector <8 x half> %x, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

A reviewer (Member) commented on the second new test:

; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
; CHECK-NEXT: return $pop[[R]]
%v = fpext <8 x half> %x to <8 x float>
%a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

Suggested change:
-%a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%a = shufflevector <8 x float> %v, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
