[AIE2PS] Spill 512-bit accumulators into vector registers

F-Stuckmann · F-Stuckmann · commit dcb9fb6d928e · 2026-04-29T07:06:51.000-06:00
Introduce a combined 512-bit spill register class that unions the
vector (mXm), accumulator (mBMm), and FIFO (lfh0/lfh1/lfl0/lfl1/
sfl/sfh/lfe) physical registers. Exposing this class to the register
coalescer via getLargestLegalSuperClass lets a 512-bit value stored
in an ACC512 vreg be allocated to a free X register (and a VEC512
value to a free BM register) instead of spilling to the stack when
its own register bank is under pressure. This mirrors the existing
AIE2P optimization.

The widening is opt-in only for ACC512 and VEC512 (compared by
pointer equality, not hasSubClassEq) to limit ripple effects on
operand-restricted sub-classes that would otherwise alter coalescing
and pre-RA scheduling.

Spill / reload of a composite-class vreg goes through two new
pseudos, VST_512_COMPOSED_REG_SPILL and VLDA_512_COMPOSED_REG_SPILL.
eliminateFrameIndex resolves the frame index to an SP-relative
immediate, and expandPostRAPseudo swaps the descriptor to the native
opcode that matches the actual physical register chosen by the
allocator: VST_dmx_sts_x_spill / VLDA_dmx_lda_x_spill for VEC512,
and VST_dmx_sts_bm_spill / VLDA_dmx_lda_bm_spill for ACC512. AIE2PS
has no native FIFO spill opcode, so the FIFO branch falls through to
report_fatal_error; in practice the allocator should not assign a
FIFO physreg to a composite-class vreg.

A new test exercises both supported branches (X and BM) of the post-RA
descriptor swap end-to-end through prologepilog and postrapseudos.
diff --git a/llvm/lib/Target/AIE/aie2ps/AIE2PSInstrInfo.cpp b/llvm/lib/Target/AIE/aie2ps/AIE2PSInstrInfo.cpp
@@ -966,6 +966,9 @@ void AIE2PSInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
              regClassMatches(AIE2PS::spill_eS_to_eRRegClass, RC, SrcReg)) {
     // Can't spill these directly.  Need to bounce through a GPR.
     return bounceViaRegClass(&AIE2PS::eRRegClass);
+  } else if (regClassMatches(AIE2PS::spill_vec512_to_compositeRegClass, RC,
+                             SrcReg)) {
+    Opcode = AIE2PS::VST_512_COMPOSED_REG_SPILL;
   } else {
     LLVM_DEBUG(I->dump());
     llvm_unreachable("Can't store this register to stack slot: is it virtual?");
@@ -1080,6 +1083,9 @@ void AIE2PSInstrInfo::loadRegFromStackSlot(
              regClassMatches(AIE2PS::spill_eS_to_eRRegClass, RC, DstReg)) {
     // Can't spill these directly.  Need to bounce through a GPR.
     return bounceViaRegClass(&AIE2PS::eRRegClass);
+  } else if (regClassMatches(AIE2PS::spill_vec512_to_compositeRegClass, RC,
+                             DstReg)) {
+    Opcode = AIE2PS::VLDA_512_COMPOSED_REG_SPILL;
   } else {
     LLVM_DEBUG(I->dump());
     llvm_unreachable(
@@ -1562,6 +1568,38 @@ bool AIE2PSInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     return true;
   }
+  case AIE2PS::VLDA_512_COMPOSED_REG_SPILL: {
+    // Composite reload: retarget the pseudo in place to the native reload
+    // opcode for the bank of the allocated destination register.
+    // FIFO512 is part of the composite RC for parity with AIE2P, but AIE2PS
+    // has no native FIFO spill opcode. The allocator should not pick a FIFO
+    // physreg in practice; if it does, fail loudly to revisit the assumption.
+    const Register Dst = MI.getOperand(0).getReg();
+    unsigned int Opcode;
+    if (AIE2PS::VEC512RegClass.contains(Dst))
+      Opcode = AIE2PS::VLDA_dmx_lda_x_spill;
+    else if (AIE2PS::ACC512RegClass.contains(Dst))
+      Opcode = AIE2PS::VLDA_dmx_lda_bm_spill;
+    else
+      report_fatal_error("VLDA_512_COMPOSED_REG_SPILL: no native AIE2PS "
+                         "spill opcode for non-VEC/non-ACC physreg");
+    MI.setDesc(get(Opcode));
+    return true; // MI was rewritten in place, so report the change.
+  }
+  case AIE2PS::VST_512_COMPOSED_REG_SPILL: {
+    // Swap in the native store opcode for Src's bank; no FIFO opcode exists.
+    const Register Src = MI.getOperand(0).getReg();
+    unsigned int Opcode;
+    if (AIE2PS::VEC512RegClass.contains(Src))
+      Opcode = AIE2PS::VST_dmx_sts_x_spill;
+    else if (AIE2PS::ACC512RegClass.contains(Src))
+      Opcode = AIE2PS::VST_dmx_sts_bm_spill;
+    else
+      report_fatal_error("VST_512_COMPOSED_REG_SPILL: no native AIE2PS "
+                         "spill opcode for non-VEC/non-ACC physreg");
+    MI.setDesc(get(Opcode));
+    return true; // MI was rewritten in place, so report the change.
+  }
   }
   return false;
 }
diff --git a/llvm/lib/Target/AIE/aie2ps/AIE2PSInstrInfo.td b/llvm/lib/Target/AIE/aie2ps/AIE2PSInstrInfo.td
@@ -160,6 +160,7 @@ def VST_Y_SPILL : Pseudo<(outs ), (ins VEC1024:$src, c17n_step128:$imm), "vst_y_
 def VST_BM_SPILL : Pseudo<(outs ), (ins ACC512:$src, c16n_step64:$imm), "vst_bm_spill", "$src, [sp, $imm]">;
 def VST_CM_SPILL : Pseudo<(outs ), (ins ACC1024:$src, c17n_step128:$imm), "vst_cm_spill", "$src, [sp, $imm]">;
 def VST_DM_SPILL : Pseudo<(outs ), (ins ACC2048:$src, c17n_step128:$imm), "vst_dm_spill", "$src, [sp, $imm]">;
+def VST_512_COMPOSED_REG_SPILL : Pseudo<(outs ), (ins spill_vec512_to_composite:$src, c16n_step64:$imm), "vst_512_composed_reg_spill", "${src}, [sp, $imm]">;
 
 def VST_E_SPILL : Pseudo<(outs ), (ins mEs:$src, c12n_step4:$imm), "vst_e_spill", "$src, [sp, $imm]">;
 def VST_EE_SPILL : Pseudo<(outs ), (ins mEEs:$src, c13n_step8:$imm), "vst_ee_spill", "$src, [sp, $imm]">;
@@ -193,6 +194,7 @@ def VLDA_Y_SPILL : Pseudo<(outs VEC1024:$dst), (ins c17n_step128:$imm), "vlda_y_
 def VLDA_BM_SPILL : Pseudo<(outs ACC512:$dst), (ins c16n_step64:$imm), "vlda_bm_spill", "${dst}, [sp, $imm]">;
 def VLDA_CM_SPILL : Pseudo<(outs ACC1024:$dst), (ins c17n_step128:$imm), "vlda_cm_spill", "${dst}, [sp, $imm]">;
 def VLDA_DM_SPILL : Pseudo<(outs ACC2048:$dst), (ins c17n_step128:$imm), "vlda_dm_spill", "${dst}, [sp, $imm]">;
+def VLDA_512_COMPOSED_REG_SPILL : Pseudo<(outs spill_vec512_to_composite:$dst), (ins c16n_step64:$imm), "vlda_512_composed_reg_spill", "${dst}, [sp, $imm]">;
 
 def VLDA_E_SPILL : Pseudo<(outs mEs:$dst), (ins c12n_step4:$imm), "vlda_e_spill", "$dst, [sp, $imm]">;
 def VLDA_EE_SPILL : Pseudo<(outs mEEs:$dst), (ins c13n_step8:$imm), "vlda_ee_spill", "$dst, [sp, $imm]">;
diff --git a/llvm/lib/Target/AIE/aie2ps/AIE2PSRegisterInfo.cpp b/llvm/lib/Target/AIE/aie2ps/AIE2PSRegisterInfo.cpp
@@ -188,6 +188,8 @@ bool AIE2PSRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   case AIE2PS::VST_dmw_sts_w_spill:
   case AIE2PS::VST_dmx_sts_bm_spill:
   case AIE2PS::VST_dmx_sts_x_spill:
+  case AIE2PS::VLDA_512_COMPOSED_REG_SPILL:
+  case AIE2PS::VST_512_COMPOSED_REG_SPILL:
     MI.getOperand(FIOperandNum).ChangeToImmediate(Offset);
     return false;
   case AIE2PS::LDA_R_SPILL:
@@ -394,6 +396,10 @@ AIE2PSRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
     return &AIE2PS::spill_eDC_to_eRRegClass;
   if (AIE2PS::eSRegClass.hasSubClassEq(RC))
     return &AIE2PS::spill_eS_to_eRRegClass;
+  if (RC == &AIE2PS::ACC512RegClass || RC == &AIE2PS::VEC512RegClass)
+    // Deliberately pointer equality: hasSubClassEq would use the composite
+    // class more often, changing register coalescing and thus scheduling.
+    return &AIE2PS::spill_vec512_to_compositeRegClass;
 
   return RC;
 }
diff --git a/llvm/lib/Target/AIE/aie2ps/AIE2PSRegisterInfo.td b/llvm/lib/Target/AIE/aie2ps/AIE2PSRegisterInfo.td
@@ -1207,6 +1207,10 @@ def spill_eDN_to_eR : AIE2PSScalarRegisterClass<(add eDN, eR)>;
 def spill_eDJ_to_eR : AIE2PSScalarRegisterClass<(add eDJ, eR, eDN)>;
 def spill_eDC_to_eR : AIE2PSScalarRegisterClass<(add eDC, eR)>;
 
+def spill_vec512_to_composite : AIE2PSVector512RegisterClass<(add mXm, mBMm, lfh0, lfh1, lfl0, lfl1, sfl, sfh, lfe)> {
+  let ConsiderInPreRAScheduling = false;
+}
+
 class AIE2PVector1076FifoRegisterClass<dag reglist> :
     AIE2PSRegisterClass<1088, 512, [i32], reglist>;
 def sub_fifo : SubRegIndex<1024, 0>;
diff --git a/llvm/test/CodeGen/AIE/aie2ps/ra/spill-vec-acc-postra-expand.mir b/llvm/test/CodeGen/AIE/aie2ps/ra/spill-vec-acc-postra-expand.mir
@@ -0,0 +1,50 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+
+# RUN: llc -mtriple=aie2ps -verify-machineinstrs \
+# RUN:     -run-pass=prologepilog -run-pass=postrapseudos %s -o - \
+# RUN:   | FileCheck %s
+
+# The combined spill_vec512_to_composite register class lets the
+# allocator place a 512-bit value in either an X (VEC512) or a BM
+# (ACC512) physreg, sharing one stack slot for either bank. After
+# allocation, eliminateFrameIndex resolves the FI to an SP-relative
+# immediate and expandPostRAPseudo swaps the composite pseudo for the
+# native opcode that matches the actual physical register chosen by
+# the allocator. This test exercises both branches of that swap.
+
+---
+name: composite_spill_lowers_to_x_native
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 64, alignment: 64 }
+body: |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: composite_spill_lowers_to_x_native
+    ; CHECK: VST_dmx_sts_x_spill renamable $x0,
+    ; CHECK: renamable $x0 = VLDA_dmx_lda_x_spill
+    VST_512_COMPOSED_REG_SPILL renamable $x0, %stack.0, implicit $sp :: (store (s512) into %stack.0)
+    renamable $x0 = VLDA_512_COMPOSED_REG_SPILL %stack.0, implicit $sp :: (load (s512) from %stack.0)
+    PseudoRET implicit $lr, implicit killed $x0
+...
+
+---
+name: composite_spill_lowers_to_bm_native
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 64, alignment: 64 }
+body: |
+  bb.0:
+    liveins: $bmll0
+    ; CHECK-LABEL: name: composite_spill_lowers_to_bm_native
+    ; CHECK: VST_dmx_sts_bm_spill renamable $bmll0,
+    ; CHECK: renamable $bmll0 = VLDA_dmx_lda_bm_spill
+    VST_512_COMPOSED_REG_SPILL renamable $bmll0, %stack.0, implicit $sp :: (store (s512) into %stack.0)
+    renamable $bmll0 = VLDA_512_COMPOSED_REG_SPILL %stack.0, implicit $sp :: (load (s512) from %stack.0)
+    PseudoRET implicit $lr, implicit killed $bmll0
+...
diff --git a/llvm/test/CodeGen/AIE/aie2ps/ra/spill-vec-acc.mir b/llvm/test/CodeGen/AIE/aie2ps/ra/spill-vec-acc.mir
@@ -31,7 +31,7 @@ body:             |
   ; CHECK-NEXT:   renamable $x5 = VBCST_32 renamable $r16
   ; CHECK-NEXT:   renamable $r0 = MOVXM_lng_cg -19312
   ; CHECK-NEXT:   renamable $x0 = VBCST_16 killed renamable $r0
-  ; CHECK-NEXT:   VST_X_SPILL killed renamable $x0, %stack.0, implicit $sp :: (store (s512) into %stack.0)
+  ; CHECK-NEXT:   renamable $bmll2 = COPY killed renamable $x0
   ; CHECK-NEXT:   renamable $r0 = MOVXM_lng_cg -19360
   ; CHECK-NEXT:   renamable $x2 = VBCST_16 killed renamable $r0
   ; CHECK-NEXT:   renamable $r0 = MOVXM_lng_cg -19424
@@ -56,10 +56,10 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $cml0:0x000000000000000C, $r16, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; CHECK-NEXT:   liveins: $bmll2, $cml0:0x000000000000000C, $r16, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $x11 = VCONV_bf16_fp32_mv_conv_mv_x_srs_bf16 renamable $cml0, implicit-def dead $srf2fflags, implicit $crf2fmask, implicit $crfpconvsat, implicit $crrnd
-  ; CHECK-NEXT:   renamable $x0 = VLDA_X_SPILL %stack.0, implicit $sp :: (load (s512) from %stack.0)
+  ; CHECK-NEXT:   renamable $x0 = COPY renamable $bmll2
   ; CHECK-NEXT:   renamable $r18 = VGE_bf16 renamable $x11, killed renamable $x0, implicit $crbf8conf, implicit $crfp8conf
   ; CHECK-NEXT:   renamable $x0 = VSEL_16 renamable $x7, renamable $x10, killed renamable $r18, implicit $crbf8conf, implicit $crfp8conf
   ; CHECK-NEXT:   renamable $r18 = VGE_bf16 renamable $x11, renamable $x2, implicit $crbf8conf, implicit $crfp8conf