From 6c76dd0ef15dadec18582106e05e0264a85edd27 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sat, 26 Apr 2025 03:21:19 +0000 Subject: [PATCH 1/7] [msan] Implement support for avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round This adds a handler, visitGenericScalarHalfwordInst, which works for mask.{add/sub/mul/div/max/min}.sh.round. Updates the tests in https://github.com/llvm/llvm-project/pull/136260 --- .../Instrumentation/MemorySanitizer.cpp | 69 +++ .../X86/avx512fp16-intrinsics.ll | 457 +++++++++--------- 2 files changed, 306 insertions(+), 220 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 8e31e8d2a4fbd..9f4708e14aa6a 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4312,6 +4312,65 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + // For sh compiler intrinsics: + // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round + // (<8 x half>, <8 x half>, <8 x half>, i8, i32) + // A B WriteThru Mask RoundingMode + // + // if (Mask[0]) + // DstShadow[0] = AShadow[0] | BShadow[0] + // else + // DstShadow[0] = WriteThruShadow[0] + // + // DstShadow[1..7] = AShadow[1..7] + void visitGenericScalarHalfwordInst(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 5); + Value *A = I.getOperand(0); + Value *B = I.getOperand(1); + Value *WriteThrough = I.getOperand(2); + Value *Mask = I.getOperand(3); + Value *RoundingMode = I.getOperand(4); + + // Technically, we could probably just check whether the LSB is initialized + insertShadowCheck(Mask, &I); + insertShadowCheck(RoundingMode, &I); + + assert(isa(A->getType())); + unsigned NumElements = + cast(A->getType())->getNumElements(); + assert(NumElements == 8); + assert(A->getType() == B->getType()); + assert(B->getType() == WriteThrough->getType()); + assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements); + assert(RoundingMode->getType()->isIntegerTy()); + + Mask = IRB.CreateBitCast( + Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements)); + + Value *AShadow = getShadow(A); + Value *BShadow = getShadow(B); + Value *ABLowerShadow = + IRB.CreateOr(IRB.CreateExtractElement( + AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)), + IRB.CreateExtractElement( + BShadow, ConstantInt::get(IRB.getInt32Ty(), 0))); + Value *WriteThroughShadow = getShadow(WriteThrough); + Value *WriteThroughLowerShadow = IRB.CreateExtractElement( + WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + + Value *DstLowerShadow = IRB.CreateSelect( + IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)), + ABLowerShadow, WriteThroughLowerShadow); + Value *DstShadow = IRB.CreateInsertElement( + AShadow, DstLowerShadow, ConstantInt::get(IRB.getInt32Ty(), 0), + "_msprop"); + + setShadow(&I, DstShadow); + setOriginForNaryOp(I); + } + // Handle Arm NEON vector load intrinsics (vld*). // // The WithLane instructions (ld[234]lane) are similar to: @@ -5041,6 +5100,16 @@ struct MemorySanitizerVisitor : public InstVisitor { break; } + case Intrinsic::x86_avx512fp16_mask_add_sh_round: + case Intrinsic::x86_avx512fp16_mask_sub_sh_round: + case Intrinsic::x86_avx512fp16_mask_mul_sh_round: + case Intrinsic::x86_avx512fp16_mask_div_sh_round: + case Intrinsic::x86_avx512fp16_mask_max_sh_round: + case Intrinsic::x86_avx512fp16_mask_min_sh_round: { + visitGenericScalarHalfwordInst(I); + break; + } + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index 61a32e5e2042e..b11b21da492d2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -13,9 +13,6 @@ ; - llvm.x86.avx512fp16.mask.getexp.sh ; - llvm.x86.avx512fp16.mask.getmant.ph.512 ; - llvm.x86.avx512fp16.mask.getmant.sh -; - llvm.x86.avx512fp16.mask.max.sh.round -; - llvm.x86.avx512fp16.mask.min.sh.round -; - llvm.x86.avx512fp16.mask.mul.sh.round ; - llvm.x86.avx512fp16.mask.rcp.ph.512 ; - llvm.x86.avx512fp16.mask.rcp.sh ; - llvm.x86.avx512fp16.mask.reduce.ph.512 @@ -27,7 +24,6 @@ ; - llvm.x86.avx512fp16.mask.scalef.ph.512 ; - llvm.x86.avx512fp16.mask.scalef.sh ; - llvm.x86.avx512fp16.mask.sqrt.sh -; - llvm.x86.avx512fp16.mask.sub.sh.round ; - llvm.x86.avx512fp16.mask.vcvtph2dq.512 ; - llvm.x86.avx512fp16.mask.vcvtph2qq.512 ; - llvm.x86.avx512fp16.mask.vcvtph2udq.512 @@ -1393,8 +1389,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1409,54 +1405,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -1476,8 +1475,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1492,54 +1491,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -1559,8 +1561,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1575,54 +1577,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -1642,8 +1647,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1658,54 +1663,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -1725,8 +1733,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1741,54 +1749,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -1808,8 +1819,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -1824,54 +1835,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0 ; CHECK-NEXT: [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] -; CHECK: [[BB13]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] -; CHECK: [[BB17]]: +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]] +; CHECK: [[BB22]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB18]]: +; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] -; CHECK: [[BB20]]: +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB21]]: +; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] -; CHECK: [[BB24]]: +; CHECK-NEXT: br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]] +; CHECK: [[BB39]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB25]]: +; CHECK: [[BB40]]: ; CHECK-NEXT: [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x half> [[RES3]] ; %val.half = load half,ptr %ptr @@ -3246,3 +3260,6 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind } attributes #0 = { sanitize_memory } +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. From 24b7e868e389473461409b74b34b166f6f2a8f4c Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sat, 26 Apr 2025 06:35:38 +0000 Subject: [PATCH 2/7] Clarify comment --- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 9f4708e14aa6a..12494f66e686e 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4312,7 +4312,7 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // For sh compiler intrinsics: + // For sh.* compiler intrinsics: // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round // (<8 x half>, <8 x half>, <8 x half>, i8, i32) // A B WriteThru Mask RoundingMode From 33ee8544bb15376f861676a5a4c28d706671123f Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 28 Apr 2025 21:44:08 +0000 Subject: [PATCH 3/7] Resolve ambiguity in LLVM instruction order --- .../Instrumentation/MemorySanitizer.cpp | 32 ++++++++--------- .../X86/avx512fp16-intrinsics.ll | 36 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 12494f66e686e..21f601c84892d 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4313,15 +4313,11 @@ struct MemorySanitizerVisitor : public InstVisitor { } // For sh.* compiler intrinsics: - // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round - // (<8 x half>, <8 x half>, <8 x half>, i8, i32) - // A B WriteThru Mask RoundingMode - // - // if (Mask[0]) - // DstShadow[0] = AShadow[0] | BShadow[0] - // else - // DstShadow[0] = WriteThruShadow[0] + // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round + // (<8 x half>, <8 x half>, <8 x half>, i8, i32) + // A B WriteThru Mask RoundingMode // + // DstShadow[0] = Mask[0] ? (AShadow[0] | BShadow[0]) : WriteThruShadow[0] // DstShadow[1..7] = AShadow[1..7] void visitGenericScalarHalfwordInst(IntrinsicInst &I) { IRBuilder<> IRB(&I); @@ -4348,21 +4344,25 @@ struct MemorySanitizerVisitor : public InstVisitor { Mask = IRB.CreateBitCast( Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements)); + Value *MaskLower = + IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)); Value *AShadow = getShadow(A); + Value *AShadowLower = IRB.CreateExtractElement( + AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + Value *BShadow = getShadow(B); - Value *ABLowerShadow = - IRB.CreateOr(IRB.CreateExtractElement( - AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)), - IRB.CreateExtractElement( - BShadow, ConstantInt::get(IRB.getInt32Ty(), 0))); + Value *BShadowLower = IRB.CreateExtractElement( + BShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + + Value *ABLowerShadow = IRB.CreateOr(AShadowLower, BShadowLower); + Value *WriteThroughShadow = getShadow(WriteThrough); Value *WriteThroughLowerShadow = IRB.CreateExtractElement( WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); - Value *DstLowerShadow = IRB.CreateSelect( - IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)), - ABLowerShadow, WriteThroughLowerShadow); + Value *DstLowerShadow = + IRB.CreateSelect(MaskLower, ABLowerShadow, WriteThroughLowerShadow); Value *DstShadow = IRB.CreateInsertElement( AShadow, DstLowerShadow, ConstantInt::get(IRB.getInt32Ty(), 0), "_msprop"); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index b11b21da492d2..9770794357a20 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -1412,11 +1412,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1427,10 +1427,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1441,11 +1441,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1498,11 +1498,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1513,10 +1513,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1527,11 +1527,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1584,11 +1584,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1599,10 +1599,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1613,11 +1613,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1670,11 +1670,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1685,10 +1685,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1699,11 +1699,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1756,11 +1756,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1771,10 +1771,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1785,11 +1785,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1842,11 +1842,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1857,10 +1857,10 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1871,11 +1871,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) ; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 From 89d6f6cc9b5b17b8737209bfd59a076e26554af4 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 28 Apr 2025 21:53:40 +0000 Subject: [PATCH 4/7] Reorder in parameter order --- .../Instrumentation/MemorySanitizer.cpp | 10 +-- .../X86/avx512fp16-intrinsics.ll | 72 +++++++++---------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 21f601c84892d..7bef2b9706b93 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4342,11 +4342,6 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements); assert(RoundingMode->getType()->isIntegerTy()); - Mask = IRB.CreateBitCast( - Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements)); - Value *MaskLower = - IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)); - Value *AShadow = getShadow(A); Value *AShadowLower = IRB.CreateExtractElement( AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); @@ -4361,6 +4356,11 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *WriteThroughLowerShadow = IRB.CreateExtractElement( WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + Mask = IRB.CreateBitCast( + Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements)); + Value *MaskLower = + IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)); + Value *DstLowerShadow = IRB.CreateSelect(MaskLower, ABLowerShadow, WriteThroughLowerShadow); Value *DstShadow = IRB.CreateInsertElement( diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index 9770794357a20..3b12e38e93832 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -1411,12 +1411,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1426,11 +1426,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1440,12 +1440,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1497,12 +1497,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1512,11 +1512,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1526,12 +1526,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1583,12 +1583,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1598,11 +1598,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1612,12 +1612,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1669,12 +1669,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1684,11 +1684,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1698,12 +1698,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1755,12 +1755,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1770,11 +1770,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1784,12 +1784,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1841,12 +1841,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]] ; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0 ; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1856,11 +1856,11 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB23]]: ; CHECK-NEXT: [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0 ; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0 @@ -1870,12 +1870,12 @@ define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half ; CHECK-NEXT: unreachable ; CHECK: [[BB31]]: ; CHECK-NEXT: [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4) -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0 ; CHECK-NEXT: [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]] ; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0 ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0 From 9b32b571df316a4150a3a32b10b706eb2e1afb94 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 28 Apr 2025 22:34:01 +0000 Subject: [PATCH 5/7] Press F to pay respects to Florian's feedback --- .../Instrumentation/MemorySanitizer.cpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7bef2b9706b93..7f29425931a6b 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4312,6 +4312,13 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + Value *extractLowerShadow(IRBuilder<> &IRB, Value *V) { + assert(isa(V->getType())); + assert(cast(V->getType())->getNumElements() > 0); + Value *Shadow = getShadow(V); + return IRB.CreateExtractElement(Shadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + } + // For sh.* compiler intrinsics: // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round // (<8 x half>, <8 x half>, <8 x half>, i8, i32) @@ -4329,7 +4336,9 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *Mask = I.getOperand(3); Value *RoundingMode = I.getOperand(4); - // Technically, we could probably just check whether the LSB is initialized + // Technically, we could probably just check whether the LSB is + // initialized, but intuitively it feels like a partly uninitialized mask + // is unintended, and we should warn the user immediately. insertShadowCheck(Mask, &I); insertShadowCheck(RoundingMode, &I); @@ -4342,25 +4351,19 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements); assert(RoundingMode->getType()->isIntegerTy()); - Value *AShadow = getShadow(A); - Value *AShadowLower = IRB.CreateExtractElement( - AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); - - Value *BShadow = getShadow(B); - Value *BShadowLower = IRB.CreateExtractElement( - BShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + Value *ALowerShadow = extractLowerShadow(IRB, A); + Value *BLowerShadow = extractLowerShadow(IRB, B); - Value *ABLowerShadow = IRB.CreateOr(AShadowLower, BShadowLower); + Value *ABLowerShadow = IRB.CreateOr(ALowerShadow, BLowerShadow); - Value *WriteThroughShadow = getShadow(WriteThrough); - Value *WriteThroughLowerShadow = IRB.CreateExtractElement( - WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + Value *WriteThroughLowerShadow = extractLowerShadow(IRB, WriteThrough); Mask = IRB.CreateBitCast( Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements)); Value *MaskLower = IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)); + Value *AShadow = getShadow(A); Value *DstLowerShadow = IRB.CreateSelect(MaskLower, ABLowerShadow, WriteThroughLowerShadow); Value *DstShadow = IRB.CreateInsertElement( From 0f0291be13a461a7fe3381ece9770ebb0d15e64c Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 28 Apr 2025 22:36:21 +0000 Subject: [PATCH 6/7] Remove unnecessary branch weights assertion --- .../MemorySanitizer/X86/avx512fp16-intrinsics.ll | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index 3b12e38e93832..c5d91adf64cb3 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -3260,6 +3260,3 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind } attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. From 5291f166a8dd65542da4395b322cf39995c9e39e Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 28 Apr 2025 22:42:56 +0000 Subject: [PATCH 7/7] clang-format --- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7f29425931a6b..b1c832ac9aeb0 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4316,7 +4316,8 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(isa(V->getType())); assert(cast(V->getType())->getNumElements() > 0); Value *Shadow = getShadow(V); - return IRB.CreateExtractElement(Shadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + return IRB.CreateExtractElement(Shadow, + ConstantInt::get(IRB.getInt32Ty(), 0)); } // For sh.* compiler intrinsics: