|
27 | 27 | #include "llvm/CodeGen/TargetOpcodes.h" |
28 | 28 | #include "llvm/IR/IntrinsicsAIE2.h" |
29 | 29 | #include "llvm/IR/IntrinsicsAIE2P.h" |
| 30 | +#include "llvm/IR/IntrinsicsAIE2PS.h" |
30 | 31 | #include "llvm/Support/Alignment.h" |
31 | 32 | #include "llvm/Support/ErrorHandling.h" |
32 | 33 | #include <optional> |
@@ -1204,6 +1205,166 @@ void llvm::applyAddVecEltUndef(MachineInstr &MI, MachineRegisterInfo &MRI, |
1204 | 1205 | MI.eraseFromParent(); |
1205 | 1206 | } |
1206 | 1207 |
|
| 1208 | +//===----------------------------------------------------------------------===// |
| 1209 | +// combine_split_intrinsic_for_store |
| 1210 | +//===----------------------------------------------------------------------===// |
| 1211 | + |
| 1212 | +/// Returns the split intrinsic ID for intrinsics that can be divided into |
| 1213 | +/// two smaller operations. This is used to optimize wide intrinsics that feed |
| 1214 | +/// stores by splitting them into narrower operations that may have better |
| 1215 | +/// instruction selection. |
| 1216 | +/// |
| 1217 | +/// Currently supported: |
| 1218 | +/// - aie2ps_I512_v64_acc32_srs -> aie2ps_I256_v32_acc32_srs |
| 1219 | +/// |
| 1220 | +/// \param OriginalID The intrinsic ID to check for splitting |
| 1221 | +/// \return The split intrinsic ID if supported, std::nullopt otherwise |
| 1222 | +/// |
| 1223 | +/// NOTE: This list may be extended in the future with additional intrinsics |
| 1224 | +/// after proper benchmarking to ensure the split version provides performance |
| 1225 | +/// benefits over the original wide intrinsic. |
| 1226 | +static std::optional<Intrinsic::ID> |
| 1227 | +getSplitIntrinsic(Intrinsic::ID OriginalID) { |
| 1228 | + switch (OriginalID) { |
| 1229 | + case Intrinsic::aie2ps_I512_v64_acc32_srs: |
| 1230 | + return Intrinsic::aie2ps_I256_v32_acc32_srs; |
| 1231 | + // Future intrinsics can be added here after benchmarking |
| 1232 | + default: |
| 1233 | + return std::nullopt; |
| 1234 | + } |
| 1235 | +} |
| 1236 | + |
| 1237 | +/// Match and split wide intrinsics that feed stores into narrower operations. |
| 1238 | +/// This combiner runs in the pre-legalizer stage and handles intrinsics that |
| 1239 | +/// can be split into two half-width operations. |
| 1240 | +/// |
| 1241 | +/// Pattern matched: |
| 1242 | +/// %result = G_INTRINSIC[_W_SIDE_EFFECTS] @wide_intrinsic, %inputs... |
| 1243 | +/// %bitcast = G_BITCAST %result |
| 1244 | +/// %lo, %hi = G_UNMERGE_VALUES %bitcast |
| 1245 | +/// G_STORE %lo, ... |
| 1246 | +/// G_STORE %hi, ... |
| 1247 | +/// |
| 1248 | +/// Transforms to: |
| 1249 | +/// %acc_lo, %acc_hi = G_UNMERGE_VALUES %input_acc |
| 1250 | +/// %result_lo = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %acc_lo, ... |
| 1251 | +/// %result_hi = G_INTRINSIC[_W_SIDE_EFFECTS] @split_intrinsic, %acc_hi, ... |
| 1252 | +/// %new_lo = G_BITCAST %result_lo |
| 1253 | +/// %new_hi = G_BITCAST %result_hi |
| 1254 | +/// G_STORE %new_lo, ... |
| 1255 | +/// G_STORE %new_hi, ... |
| 1256 | +bool llvm::matchSplitIntrinsicForStore(MachineInstr &MI, |
| 1257 | + MachineRegisterInfo &MRI, |
| 1258 | + const AIEBaseInstrInfo &TII, |
| 1259 | + BuildFnTy &MatchInfo) { |
| 1260 | + // 1. Verify this is an intrinsic and check if it can be split |
| 1261 | + const unsigned Opcode = MI.getOpcode(); |
| 1262 | + if (Opcode != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS && |
| 1263 | + Opcode != TargetOpcode::G_INTRINSIC) |
| 1264 | + return false; |
| 1265 | + |
| 1266 | + const auto *IntrMI = cast<GIntrinsic>(&MI); |
| 1267 | + const Intrinsic::ID IntrinsicID = IntrMI->getIntrinsicID(); |
| 1268 | + |
| 1269 | + const auto SplitIntrinsicID = getSplitIntrinsic(IntrinsicID); |
| 1270 | + if (!SplitIntrinsicID) |
| 1271 | + return false; |
| 1272 | + |
| 1273 | + // 2. Get intrinsic output register and verify single use |
| 1274 | + const Register IntrinsicOutReg = MI.getOperand(0).getReg(); |
| 1275 | + |
| 1276 | + auto GetSingleOpcodeUse = [&MRI](Register Reg, |
| 1277 | + unsigned Opcode) -> MachineInstr * { |
| 1278 | + if (!MRI.hasOneNonDBGUse(Reg)) |
| 1279 | + return nullptr; |
| 1280 | + MachineInstr *SingleMI = &*MRI.use_nodbg_instructions(Reg).begin(); |
| 1281 | + if (SingleMI && (SingleMI->getOpcode() == Opcode)) |
| 1282 | + return SingleMI; |
| 1283 | + return nullptr; |
| 1284 | + }; |
| 1285 | + |
| 1286 | + // 3. Check that the single use is a BITCAST |
| 1287 | + MachineInstr *BitcastMI = |
| 1288 | + GetSingleOpcodeUse(IntrinsicOutReg, TargetOpcode::G_BITCAST); |
| 1289 | + if (!BitcastMI) |
| 1290 | + return false; |
| 1291 | + |
| 1292 | + const Register BitcastReg = BitcastMI->getOperand(0).getReg(); |
| 1293 | + |
| 1294 | + // 4. Check that the single use is an UNMERGE |
| 1295 | + MachineInstr *UnmergeMI = |
| 1296 | + GetSingleOpcodeUse(BitcastReg, TargetOpcode::G_UNMERGE_VALUES); |
| 1297 | + if (!UnmergeMI) |
| 1298 | + return false; |
| 1299 | + |
| 1300 | + // 5. Verify UNMERGE produces exactly 2 results |
| 1301 | + if (UnmergeMI->getNumDefs() != 2) |
| 1302 | + return false; |
| 1303 | + |
| 1304 | + // 6. Get the two unmerge output registers |
| 1305 | + const Register LoReg = UnmergeMI->getOperand(0).getReg(); |
| 1306 | + const Register HiReg = UnmergeMI->getOperand(1).getReg(); |
| 1307 | + |
| 1308 | + if (!GetSingleOpcodeUse(LoReg, TargetOpcode::G_STORE) || |
| 1309 | + !GetSingleOpcodeUse(HiReg, TargetOpcode::G_STORE)) |
| 1310 | + return false; |
| 1311 | + |
| 1312 | + // 7. Extract intrinsic operands (first operand after the intrinsic ID) |
| 1313 | + // For G_INTRINSIC_W_SIDE_EFFECTS: operand 0 = def, 1 = ID, 2+ = inputs |
| 1314 | + // For G_INTRINSIC: operand 0 = def, 1 = ID, 2+ = inputs |
| 1315 | + const Register AccReg = MI.getOperand(2).getReg(); |
| 1316 | + const Register ShiftReg = MI.getOperand(3).getReg(); |
| 1317 | + const Register SignReg = MI.getOperand(4).getReg(); |
| 1318 | + |
| 1319 | + // 8. Derive types from the IR (no hardcoded types!) |
| 1320 | + const LLT OrigAccTy = MRI.getType(AccReg); |
| 1321 | + const LLT OrigIntrOutTy = MRI.getType(IntrinsicOutReg); |
| 1322 | + |
| 1323 | + // Calculate split types by dividing by 2 |
| 1324 | + const LLT AccHalfTy = OrigAccTy.divide(2); |
| 1325 | + const LLT IntrOutHalfTy = OrigIntrOutTy.divide(2); |
| 1326 | + |
| 1327 | + // 9. Build the transformation |
| 1328 | + // Note: We use applyBuildFnNoErase. We replace register uses and let DCE |
| 1329 | + // clean up dead instructions. |
| 1330 | + MatchInfo = [=, &MI, &MRI](MachineIRBuilder &B) { |
| 1331 | + // Step 1: Unmerge the accumulator into two halves |
| 1332 | + const Register AccLoReg = MRI.createGenericVirtualRegister(AccHalfTy); |
| 1333 | + const Register AccHiReg = MRI.createGenericVirtualRegister(AccHalfTy); |
| 1334 | + B.buildUnmerge({AccLoReg, AccHiReg}, AccReg); |
| 1335 | + |
| 1336 | + // Step 2: Create two split intrinsics using the ID from getSplitIntrinsic |
| 1337 | + const bool HasSideEffects = |
| 1338 | + (Opcode == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); |
| 1339 | + |
| 1340 | + const Register IntrOutLoReg = |
| 1341 | + MRI.createGenericVirtualRegister(IntrOutHalfTy); |
| 1342 | + B.buildIntrinsic(*SplitIntrinsicID, IntrOutLoReg, HasSideEffects, |
| 1343 | + /*isConvergent=*/false) |
| 1344 | + .addUse(AccLoReg) |
| 1345 | + .addUse(ShiftReg) |
| 1346 | + .addUse(SignReg); |
| 1347 | + |
| 1348 | + const Register IntrOutHiReg = |
| 1349 | + MRI.createGenericVirtualRegister(IntrOutHalfTy); |
| 1350 | + B.buildIntrinsic(*SplitIntrinsicID, IntrOutHiReg, HasSideEffects, |
| 1351 | + /*isConvergent=*/false) |
| 1352 | + .addUse(AccHiReg) |
| 1353 | + .addUse(ShiftReg) |
| 1354 | + .addUse(SignReg); |
| 1355 | + |
| 1356 | + // Step 3: Bitcast each intrinsic result to the store type |
| 1357 | + B.buildBitcast(LoReg, IntrOutLoReg); |
| 1358 | + B.buildBitcast(HiReg, IntrOutHiReg); |
| 1359 | + |
| 1360 | + MI.eraseFromParent(); |
| 1361 | + UnmergeMI->eraseFromParent(); |
| 1362 | + BitcastMI->eraseFromParent(); |
| 1363 | + }; |
| 1364 | + |
| 1365 | + return true; |
| 1366 | +} |
| 1367 | + |
1207 | 1368 | /// Get an s32/s20 value from an s20 register that comes from either: |
1208 | 1369 | /// 1. G_TRUNC of s32 -> returns the original s32 register |
1209 | 1370 | /// 2. G_ZEXTLOAD of s16 -> returns the s20 register (already zero-extended) |
|
0 commit comments