Skip to content

Commit 9ce298a

Browse files
seven-mile authored and lanza committed
[CIR][CodeGen] Special treatment of 3-element extended vector load and store (llvm#674)
Continues the work of llvm#613. The original CodeGen treats vec3 as vec4 to get aligned memory access. This PR enables these paths.
1 parent 7abde0a commit 9ce298a

File tree

2 files changed

+91
-6
lines changed

2 files changed

+91
-6
lines changed

clang/lib/CIR/CodeGen/CIRGenExpr.cpp

+33-6
Original file line number | Diff line number | Diff line change
@@ -596,10 +596,23 @@ void CIRGenFunction::buildStoreOfScalar(mlir::Value Value, Address Addr,
596596
return;
597597
}
598598

599+
mlir::Type SrcTy = Value.getType();
599600
if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
601+
auto VecTy = dyn_cast<mlir::cir::VectorType>(SrcTy);
600602
if (!CGM.getCodeGenOpts().PreserveVec3Type &&
601-
ClangVecTy->getNumElements() == 3)
602-
llvm_unreachable("NYI: Special treatment of 3-element vector store");
603+
ClangVecTy->getNumElements() == 3) {
604+
// Handle vec3 special.
605+
if (VecTy && VecTy.getSize() == 3) {
606+
// Our source is a vec3, do a shuffle vector to make it a vec4.
607+
Value = builder.createVecShuffle(Value.getLoc(), Value,
608+
ArrayRef<int64_t>{0, 1, 2, -1});
609+
SrcTy = mlir::cir::VectorType::get(VecTy.getContext(),
610+
VecTy.getEltType(), 4);
611+
}
612+
if (Addr.getElementType() != SrcTy) {
613+
Addr = Addr.withElementType(SrcTy);
614+
}
615+
}
603616
}
604617

605618
// Update the alloca with more info on initialization.
@@ -772,7 +785,7 @@ void CIRGenFunction::buildStoreThroughExtVectorComponentLValue(RValue Src,
772785
// of the Elts constant array will be one past the size of the vector.
773786
// Ignore the last element here, if it is greater than the mask size.
774787
if (getAccessedFieldNo(NumSrcElts - 1, Elts) == Mask.size())
775-
llvm_unreachable("NYI");
788+
NumSrcElts--;
776789

777790
// modify when what gets shuffled in
778791
for (unsigned i = 0; i != NumSrcElts; ++i)
@@ -2770,14 +2783,28 @@ mlir::Value CIRGenFunction::buildLoadOfScalar(Address Addr, bool Volatile,
27702783
llvm_unreachable("NYI");
27712784
}
27722785

2786+
auto ElemTy = Addr.getElementType();
2787+
27732788
if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
2789+
// Handle vectors of size 3 like size 4 for better performance.
2790+
const auto VTy = cast<mlir::cir::VectorType>(ElemTy);
2791+
27742792
if (!CGM.getCodeGenOpts().PreserveVec3Type &&
2775-
ClangVecTy->getNumElements() == 3)
2776-
llvm_unreachable("NYI: Special treatment of 3-element vector load");
2793+
ClangVecTy->getNumElements() == 3) {
2794+
auto loc = Addr.getPointer().getLoc();
2795+
auto vec4Ty =
2796+
mlir::cir::VectorType::get(VTy.getContext(), VTy.getEltType(), 4);
2797+
Address Cast = Addr.withElementType(vec4Ty);
2798+
// Now load value.
2799+
mlir::Value V = builder.createLoad(loc, Cast);
2800+
2801+
// Shuffle vector to get vec3.
2802+
V = builder.createVecShuffle(loc, V, ArrayRef<int64_t>{0, 1, 2});
2803+
return buildFromMemory(V, Ty);
2804+
}
27772805
}
27782806

27792807
auto Ptr = Addr.getPointer();
2780-
auto ElemTy = Addr.getElementType();
27812808
if (ElemTy.isa<mlir::cir::VoidType>()) {
27822809
ElemTy = mlir::cir::IntType::get(builder.getContext(), 8, true);
27832810
auto ElemPtrTy = mlir::cir::PointerType::get(builder.getContext(), ElemTy);

clang/test/CIR/CodeGen/vectype-ext.cpp

+58
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
// RUN: FileCheck --input-file=%t.ll %s -check-prefix=LLVM
55

66
typedef int vi4 __attribute__((ext_vector_type(4)));
7+
typedef int vi3 __attribute__((ext_vector_type(3)));
78
typedef int vi2 __attribute__((ext_vector_type(2)));
89
typedef double vd2 __attribute__((ext_vector_type(2)));
910
typedef long vl2 __attribute__((ext_vector_type(2)));
@@ -349,6 +350,10 @@ void test_store() {
349350
// CIR-NEXT: %[[#PVECB:]] = cir.alloca !cir.vector<!s32i x 2>
350351
// LLVM-NEXT: %[[#PVECB:]] = alloca <2 x i32>
351352

353+
vi3 c = {};
354+
// CIR-NEXT: %[[#PVECC:]] = cir.alloca !cir.vector<!s32i x 3>
355+
// LLVM-NEXT: %[[#PVECC:]] = alloca <3 x i32>
356+
352357
a.xy = b;
353358
// CIR: %[[#LOAD4RHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
354359
// CIR-NEXT: %[[#LOAD5LHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
@@ -388,6 +393,35 @@ void test_store() {
388393
// LLVM-NEXT: %[[#RESULT:]] = shufflevector <4 x i32> %[[#VECA]], <4 x i32> %[[#EXTVECB]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
389394
// LLVM-NEXT: store <4 x i32> %[[#RESULT]], ptr %[[#PVECA]], align 16
390395

396+
// OpenCL C Specification 6.3.7. Vector Components
397+
// The suffixes .lo (or .even) and .hi (or .odd) for a 3-component vector type
398+
// operate as if the 3-component vector type is a 4-component vector type with
399+
// the value in the w component undefined.
400+
b = c.hi;
401+
402+
// CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
403+
// CIR-NEXT: %[[#HIPART:]] = cir.vec.shuffle(%[[#VECC]], %[[#VECC]] : !cir.vector<!s32i x 3>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 2>
404+
// CIR-NEXT: cir.store %[[#HIPART]], %[[#PVECB]] : !cir.vector<!s32i x 2>, !cir.ptr<!cir.vector<!s32i x 2>>
405+
406+
// LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
407+
// LLVM-NEXT: %[[#HIPART:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#VECC]], <2 x i32> <i32 2, i32 3>
408+
// LLVM-NEXT: store <2 x i32> %[[#HIPART]], ptr %[[#PVECB]], align 8
409+
410+
// c.hi is c[2, 3], in which 3 should be ignored in CIRGen for store
411+
c.hi = b;
412+
413+
// CIR-NEXT: %[[#VECB:]] = cir.load %[[#PVECB]] : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
414+
// CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
415+
// CIR-NEXT: %[[#EXTVECB:]] = cir.vec.shuffle(%[[#VECB]], %[[#VECB]] : !cir.vector<!s32i x 2>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 3>
416+
// CIR-NEXT: %[[#RESULT:]] = cir.vec.shuffle(%[[#VECC]], %[[#EXTVECB]] : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 3>
417+
// CIR-NEXT: cir.store %[[#RESULT]], %[[#PVECC]] : !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>
418+
419+
// LLVM-NEXT: %[[#VECB:]] = load <2 x i32>, ptr %[[#PVECB]], align 8
420+
// LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
421+
// LLVM-NEXT: %[[#EXTVECB:]] = shufflevector <2 x i32> %[[#VECB]], <2 x i32> %[[#VECB]], <3 x i32> <i32 0, i32 1, i32 poison>
422+
// LLVM-NEXT: %[[#RESULT:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#EXTVECB]], <3 x i32> <i32 0, i32 1, i32 3>
423+
// LLVM-NEXT: store <3 x i32> %[[#RESULT]], ptr %[[#PVECC]], align 16
424+
391425
}
392426

393427
// CIR: cir.func {{@.*test_build_lvalue.*}}
@@ -452,3 +486,27 @@ void test_build_lvalue() {
452486
// LLVM-NEXT: store i32 %[[#RESULT]], ptr %[[#ALLOCAR]], align 4
453487

454488
}
489+
490+
// CIR: cir.func {{@.*test_vec3.*}}
491+
// LLVM: define void {{@.*test_vec3.*}}
492+
void test_vec3() {
// Exercises the vec3-as-vec4 memory access path for a 3-element ext vector:
// the initializing store widens the value to 4 elements with a shuffle and
// writes through a bitcast pointer, and the load feeding `v + 1` reads 4
// elements and shuffles back down to 3.
493+
vi3 v = {};
494+
// CIR-NEXT: %[[#PV:]] = cir.alloca !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>, ["v", init] {alignment = 16 : i64}
495+
// CIR: %[[#VEC4:]] = cir.vec.shuffle(%{{[0-9]+}}, %{{[0-9]+}} : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 4>
496+
// CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
497+
// CIR-NEXT: cir.store %[[#VEC4]], %[[#PV4]] : !cir.vector<!s32i x 4>, !cir.ptr<!cir.vector<!s32i x 4>>
498+
499+
// LLVM-NEXT: %[[#PV:]] = alloca <3 x i32>, i64 1, align 16
500+
// LLVM-NEXT: store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr %[[#PV]], align 16
501+
502+
v + 1;
503+
// CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
504+
// CIR-NEXT: %[[#V4:]] = cir.load %[[#PV4]] : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
505+
// CIR-NEXT: %[[#V3:]] = cir.vec.shuffle(%[[#V4]], %[[#V4]] : !cir.vector<!s32i x 4>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<!s32i x 3>
506+
// CIR: %[[#RES:]] = cir.binop(add, %[[#V3]], %{{[0-9]+}}) : !cir.vector<!s32i x 3>
507+
508+
// The load below reuses the PV value captured at the alloca (a use, not a
// fresh definition) so the check only passes when the widened load reads
// from the storage of `v`.
// LLVM-NEXT: %[[#V4:]] = load <4 x i32>, ptr %[[#PV]], align 16
509+
// LLVM-NEXT: %[[#V3:]] = shufflevector <4 x i32> %[[#V4]], <4 x i32> %[[#V4]], <3 x i32> <i32 0, i32 1, i32 2>
510+
// LLVM-NEXT: %[[#RES:]] = add <3 x i32> %[[#V3]], <i32 1, i32 1, i32 1>
511+
512+
}

0 commit comments

Comments
 (0)