Skip to content

Commit b5bfeaf

Browse files
committed
[AIE2PS] Add combiner to split wide intrinsics for store fusion
Adds a pre-legalizer combiner that splits wide intrinsics feeding stores into narrower operations, enabling better instruction selection through intrinsic+store fusion patterns.

Pattern matched:
  %result = G_INTRINSIC[_W_SIDE_EFFECTS] @wide_intrinsic, %inputs...
  %bitcast = G_BITCAST %result
  %lo, %hi = G_UNMERGE_VALUES %bitcast
  G_STORE %lo, ...
  G_STORE %hi, ...

Transforms to:
  %input_lo, %input_hi = G_UNMERGE_VALUES %input
  %result_lo = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %input_lo, ...
  %result_hi = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %input_hi, ...
  %lo = G_BITCAST %result_lo
  %hi = G_BITCAST %result_hi
  G_STORE %lo, ...
  G_STORE %hi, ...

Currently supports I512 SRS -> 2x I256 SRS, enabling later SRS+STORE fusion in instruction selection. The implementation is generic and extensible via getSplitIntrinsic() for future intrinsics after benchmarking confirms performance benefits.

Includes comprehensive tests covering positive and negative cases.
1 parent 822b570 commit b5bfeaf

4 files changed

Lines changed: 536 additions & 1 deletion

File tree

llvm/lib/Target/AIE/AIECombine.td

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,13 @@ def combine_alternating_build_vector : GICombineRule<
7777
(apply [{ applyAlternatingBuildVector(*${root}, MRI, B, ${matchinfo}, Observer); }])
7878
>;
7979

80+
// Combine rule rooted at G_INTRINSIC / G_INTRINSIC_W_SIDE_EFFECTS.
// matchSplitIntrinsicForStore decides whether the root qualifies and fills
// $matchinfo with a build function. The rule applies it through
// applyBuildFnNoErase, so any instruction removal is done by that build
// function rather than by the helper.
def combine_split_intrinsic_for_store : GICombineRule<
81+
(defs root:$root, build_fn_matchinfo:$matchinfo),
82+
(match (wip_match_opcode G_INTRINSIC_W_SIDE_EFFECTS, G_INTRINSIC): $root,
83+
[{ return matchSplitIntrinsicForStore(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
84+
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])
85+
>;
86+
8087
def combine_splat_vector_matchdata: GIDefMatchData<"std::pair<Register, Register>">;
8188
def combine_splat_vector : GICombineRule<
8289
(defs root:$root, combine_splat_vector_matchdata:$matchinfo),
@@ -600,9 +607,14 @@ def AIE2PPostLegalizerCustomCombiner
600607
[aie_postlegalizer_custom_shared_combines, aie2p_plus_postlegalizer_custom_shared_combines]> {
601608
}
602609

610+
// AIE2PS-specific pre-legalizer combines.
// Currently contains only the rule that splits a wide intrinsic feeding
// stores into two half-width intrinsics.
611+
def aie2ps_prelegalizer_additional_combines : GICombineGroup<[
612+
combine_split_intrinsic_for_store
613+
]>;
614+
603615
def AIE2PSPreLegalizerCombiner
604616
: GICombiner<"AIE2PSPreLegalizerCombinerImpl",
605-
[aie_generic_combines, aie_additional_combines, aie2p_additional_combines]> {
617+
[aie_generic_combines, aie_additional_combines, aie2p_additional_combines, aie2ps_prelegalizer_additional_combines]> {
606618
let CombineAllMethodName = "tryCombineAllImpl";
607619
}
608620

llvm/lib/Target/AIE/AIECombinerHelper.cpp

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "llvm/CodeGen/TargetOpcodes.h"
2828
#include "llvm/IR/IntrinsicsAIE2.h"
2929
#include "llvm/IR/IntrinsicsAIE2P.h"
30+
#include "llvm/IR/IntrinsicsAIE2PS.h"
3031
#include "llvm/Support/Alignment.h"
3132
#include "llvm/Support/ErrorHandling.h"
3233
#include <optional>
@@ -1204,6 +1205,166 @@ void llvm::applyAddVecEltUndef(MachineInstr &MI, MachineRegisterInfo &MRI,
12041205
MI.eraseFromParent();
12051206
}
12061207

1208+
//===----------------------------------------------------------------------===//
1209+
// combine_split_intrinsic_for_store
1210+
//===----------------------------------------------------------------------===//
1211+
1212+
/// Returns the split intrinsic ID for intrinsics that can be divided into
1213+
/// two smaller operations. This is used to optimize wide intrinsics that feed
1214+
/// stores by splitting them into narrower operations that may have better
1215+
/// instruction selection.
1216+
///
1217+
/// Currently supported:
1218+
/// - aie2ps_I512_v64_acc32_srs -> aie2ps_I256_v32_acc32_srs
1219+
///
1220+
/// \param OriginalID The intrinsic ID to check for splitting
1221+
/// \return The split intrinsic ID if supported, std::nullopt otherwise
1222+
///
1223+
/// NOTE: This list may be extended in the future with additional intrinsics
1224+
/// after proper benchmarking to ensure the split version provides performance
1225+
/// benefits over the original wide intrinsic.
1226+
static std::optional<Intrinsic::ID>
1227+
getSplitIntrinsic(Intrinsic::ID OriginalID) {
1228+
switch (OriginalID) {
1229+
case Intrinsic::aie2ps_I512_v64_acc32_srs:
1230+
return Intrinsic::aie2ps_I256_v32_acc32_srs;
1231+
// Future intrinsics can be added here after benchmarking
1232+
default:
1233+
return std::nullopt;
1234+
}
1235+
}
1236+
1237+
/// Match and split wide intrinsics that feed stores into narrower operations.
1238+
/// This combiner runs in the pre-legalizer stage and handles intrinsics that
1239+
/// can be split into two half-width operations.
1240+
///
1241+
/// Pattern matched:
1242+
/// %result = G_INTRINSIC[_W_SIDE_EFFECTS] @wide_intrinsic, %inputs...
1243+
/// %bitcast = G_BITCAST %result
1244+
/// %lo, %hi = G_UNMERGE_VALUES %bitcast
1245+
/// G_STORE %lo, ...
1246+
/// G_STORE %hi, ...
1247+
///
1248+
/// Transforms to:
1249+
/// %acc_lo, %acc_hi = G_UNMERGE_VALUES %input_acc
1250+
/// %result_lo = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %acc_lo, ...
1251+
/// %result_hi = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %acc_hi, ...
1252+
/// %new_lo = G_BITCAST %result_lo
1253+
/// %new_hi = G_BITCAST %result_hi
1254+
/// G_STORE %new_lo, ...
1255+
/// G_STORE %new_hi, ...
1256+
bool llvm::matchSplitIntrinsicForStore(MachineInstr &MI,
1257+
MachineRegisterInfo &MRI,
1258+
const AIEBaseInstrInfo &TII,
1259+
BuildFnTy &MatchInfo) {
1260+
// 1. Verify this is an intrinsic and check if it can be split
1261+
const unsigned Opcode = MI.getOpcode();
1262+
if (Opcode != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
1263+
Opcode != TargetOpcode::G_INTRINSIC)
1264+
return false;
1265+
1266+
const auto *IntrMI = cast<GIntrinsic>(&MI);
1267+
const Intrinsic::ID IntrinsicID = IntrMI->getIntrinsicID();
1268+
1269+
const auto SplitIntrinsicID = getSplitIntrinsic(IntrinsicID);
1270+
if (!SplitIntrinsicID)
1271+
return false;
1272+
1273+
// 2. Get intrinsic output register and verify single use
1274+
const Register IntrinsicOutReg = MI.getOperand(0).getReg();
1275+
1276+
auto GetSingleOpcodeUse = [&MRI](Register Reg,
1277+
unsigned Opcode) -> MachineInstr * {
1278+
if (!MRI.hasOneNonDBGUse(Reg))
1279+
return nullptr;
1280+
MachineInstr *SingleMI = &*MRI.use_nodbg_instructions(Reg).begin();
1281+
if (SingleMI && (SingleMI->getOpcode() == Opcode))
1282+
return SingleMI;
1283+
return nullptr;
1284+
};
1285+
1286+
// 3. Check that the single use is a BITCAST
1287+
MachineInstr *BitcastMI =
1288+
GetSingleOpcodeUse(IntrinsicOutReg, TargetOpcode::G_BITCAST);
1289+
if (!BitcastMI)
1290+
return false;
1291+
1292+
const Register BitcastReg = BitcastMI->getOperand(0).getReg();
1293+
1294+
// 4. Check that the single use is an UNMERGE
1295+
MachineInstr *UnmergeMI =
1296+
GetSingleOpcodeUse(BitcastReg, TargetOpcode::G_UNMERGE_VALUES);
1297+
if (!UnmergeMI)
1298+
return false;
1299+
1300+
// 5. Verify UNMERGE produces exactly 2 results
1301+
if (UnmergeMI->getNumDefs() != 2)
1302+
return false;
1303+
1304+
// 6. Get the two unmerge output registers
1305+
const Register LoReg = UnmergeMI->getOperand(0).getReg();
1306+
const Register HiReg = UnmergeMI->getOperand(1).getReg();
1307+
1308+
if (!GetSingleOpcodeUse(LoReg, TargetOpcode::G_STORE) ||
1309+
!GetSingleOpcodeUse(HiReg, TargetOpcode::G_STORE))
1310+
return false;
1311+
1312+
// 7. Extract intrinsic operands (first operand after the intrinsic ID)
1313+
// For G_INTRINSIC_W_SIDE_EFFECTS: operand 0 = def, 1 = ID, 2+ = inputs
1314+
// For G_INTRINSIC: operand 0 = def, 1 = ID, 2+ = inputs
1315+
const Register AccReg = MI.getOperand(2).getReg();
1316+
const Register ShiftReg = MI.getOperand(3).getReg();
1317+
const Register SignReg = MI.getOperand(4).getReg();
1318+
1319+
// 8. Derive types from the IR (no hardcoded types!)
1320+
const LLT OrigAccTy = MRI.getType(AccReg);
1321+
const LLT OrigIntrOutTy = MRI.getType(IntrinsicOutReg);
1322+
1323+
// Calculate split types by dividing by 2
1324+
const LLT AccHalfTy = OrigAccTy.divide(2);
1325+
const LLT IntrOutHalfTy = OrigIntrOutTy.divide(2);
1326+
1327+
// 9. Build the transformation
1328+
// Note: We use applyBuildFnNoErase. We replace register uses and let DCE
1329+
// clean up dead instructions.
1330+
MatchInfo = [=, &MI, &MRI](MachineIRBuilder &B) {
1331+
// Step 1: Unmerge the accumulator into two halves
1332+
const Register AccLoReg = MRI.createGenericVirtualRegister(AccHalfTy);
1333+
const Register AccHiReg = MRI.createGenericVirtualRegister(AccHalfTy);
1334+
B.buildUnmerge({AccLoReg, AccHiReg}, AccReg);
1335+
1336+
// Step 2: Create two split intrinsics using the ID from getSplitIntrinsic
1337+
const bool HasSideEffects =
1338+
(Opcode == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1339+
1340+
const Register IntrOutLoReg =
1341+
MRI.createGenericVirtualRegister(IntrOutHalfTy);
1342+
B.buildIntrinsic(*SplitIntrinsicID, IntrOutLoReg, HasSideEffects,
1343+
/*isConvergent=*/false)
1344+
.addUse(AccLoReg)
1345+
.addUse(ShiftReg)
1346+
.addUse(SignReg);
1347+
1348+
const Register IntrOutHiReg =
1349+
MRI.createGenericVirtualRegister(IntrOutHalfTy);
1350+
B.buildIntrinsic(*SplitIntrinsicID, IntrOutHiReg, HasSideEffects,
1351+
/*isConvergent=*/false)
1352+
.addUse(AccHiReg)
1353+
.addUse(ShiftReg)
1354+
.addUse(SignReg);
1355+
1356+
// Step 3: Bitcast each intrinsic result to the store type
1357+
B.buildBitcast(LoReg, IntrOutLoReg);
1358+
B.buildBitcast(HiReg, IntrOutHiReg);
1359+
1360+
MI.eraseFromParent();
1361+
UnmergeMI->eraseFromParent();
1362+
BitcastMI->eraseFromParent();
1363+
};
1364+
1365+
return true;
1366+
}
1367+
12071368
/// Get an s32/s20 value from an s20 register that comes from either:
12081369
/// 1. G_TRUNC of s32 -> returns the original s32 register
12091370
/// 2. G_ZEXTLOAD of s16 -> returns the s20 register (already zero-extended)

llvm/lib/Target/AIE/AIECombinerHelper.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,30 @@ void applyAlternatingBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
395395
AIEAlternatingBuildVectorMatchData &MatchInfo,
396396
GISelChangeObserver &Observer);
397397

398+
/// Match and split a 512-bit SRS intrinsic that feeds stores through BITCAST
399+
/// and UNMERGE. This enables later SRS+STORE fusion in instruction selection.
400+
/// Pattern matched:
401+
/// %srs:_(<64 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS
402+
/// intrinsic(@llvm.aie2ps.I512.v64.acc32.srs), %acc(<64 x s32>), %shift,
403+
/// %sign
404+
/// %bitcast:_(<16 x s32>) = G_BITCAST %srs
405+
/// %lo:_(<8 x s32>), %hi:_(<8 x s32>) = G_UNMERGE_VALUES %bitcast
406+
/// G_STORE %lo, %ptr1
407+
/// G_STORE %hi, %ptr2
408+
/// Transforms to:
409+
/// %acc_lo:_(<32 x s32>), %acc_hi:_(<32 x s32>) = G_UNMERGE_VALUES %acc
410+
/// %srs_lo:_(<32 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS
411+
/// intrinsic(@llvm.aie2ps.I256.v32.acc32.srs), %acc_lo, %shift, %sign
412+
/// %srs_hi:_(<32 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS
413+
/// intrinsic(@llvm.aie2ps.I256.v32.acc32.srs), %acc_hi, %shift, %sign
414+
/// %lo:_(<8 x s32>) = G_BITCAST %srs_lo
415+
/// %hi:_(<8 x s32>) = G_BITCAST %srs_hi
416+
/// G_STORE %lo, %ptr1
417+
/// G_STORE %hi, %ptr2
///
/// The build function stored in \p MatchInfo erases the original intrinsic,
/// G_BITCAST and G_UNMERGE_VALUES itself, which is why the combine rule applies
/// it with applyBuildFnNoErase.
///
/// \param MI        Root instruction (G_INTRINSIC or
///                  G_INTRINSIC_W_SIDE_EFFECTS).
/// \param MRI       Used to inspect uses and types and to create new virtual
///                  registers.
/// \param TII       NOTE(review): not read by the definition in
///                  AIECombinerHelper.cpp; confirm whether it is still needed.
/// \param MatchInfo Receives the build function on a successful match.
/// \return true when the pattern matched and \p MatchInfo was set.
418+
bool matchSplitIntrinsicForStore(MachineInstr &MI, MachineRegisterInfo &MRI,
419+
const AIEBaseInstrInfo &TII,
420+
BuildFnTy &MatchInfo);
421+
398422
bool matchVShiftChainToCopy(MachineInstr &MI, MachineRegisterInfo &MRI,
399423
const AIEBaseInstrInfo &TII, BuildFnTy &MatchInfo);
400424

0 commit comments

Comments
 (0)