Skip to content

Commit 73bdeb4

Browse files
committed
[AIEX] Enable AIEWAWRegRewriter latency-aware heuristic
1 parent 092de1d commit 73bdeb4

9 files changed

Lines changed: 221 additions & 219 deletions

File tree

llvm/lib/Target/AIE/AIEWawRegRewriter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ static cl::opt<unsigned>
6060
cl::init(3));
6161

6262
static cl::opt<bool>
63-
LatencyAware("aie-realloc-latencyaware", cl::Hidden, cl::init(false),
63+
LatencyAware("aie-realloc-latencyaware", cl::Hidden, cl::init(true),
6464
cl::desc("Enable latency-aware allocation strategy"));
6565

6666
namespace {

llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i
3434
define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 {
3535
; ASM-LABEL: add2d:
3636
; ASM: // %bb.0: // %newFuncRoot
37-
; ASM-NEXT: lda r2, [p0, #64]; paddb [p0], #40; nopxm ; nops
38-
; ASM-NEXT: lda m2, [p0], #-4
37+
; ASM-NEXT: paddb [p0], #40; lda r2, [p0, #64]; nops ; nopxm ; nopv
38+
; ASM-NEXT: lda m2, [p0], #-4; nopx
3939
; ASM-NEXT: lda m5, [p0], #8
4040
; ASM-NEXT: lda m4, [p0], #8
4141
; ASM-NEXT: lda m3, [p0], #-24
@@ -61,13 +61,13 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
6161
; ASM-NEXT: lda p4, [p7], #-4; st m0, [p0, #0]; ne r6, r0, r6
6262
; ASM-NEXT: lda r13, [p7], #-4; st dj0, [p0, #0]; movx r0, #3
6363
; ASM-NEXT: st dj4, [p0, #0]; ltu r7, r7, r0
64-
; ASM-NEXT: lda r9, [p7], #-4; st dn0, [p0, #0]; nez r1, r1
65-
; ASM-NEXT: lda r10, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
66-
; ASM-NEXT: lda r11, [p7], #-4; st r1, [p6, #0] // Delay Slot 5
67-
; ASM-NEXT: lda p7, [p7, #-4]; st r5, [p0, #0] // Delay Slot 4
68-
; ASM-NEXT: paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
69-
; ASM-NEXT: lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13 // Delay Slot 2
70-
; ASM-NEXT: padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
64+
; ASM-NEXT: st dn0, [p0, #0]; nez r1, r1
65+
; ASM-NEXT: lda r9, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
66+
; ASM-NEXT: lda r10, [p7], #-4; st r1, [p6, #0] // Delay Slot 5
67+
; ASM-NEXT: lda r11, [p7], #-4; st r5, [p0, #0] // Delay Slot 4
68+
; ASM-NEXT: lda p7, [p7, #-4]; paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
69+
; ASM-NEXT: lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13; padds [p1], m2 // Delay Slot 2
70+
; ASM-NEXT: mova r0, #0; paddb [p2], m3; st r8, [p0, #0] // Delay Slot 1
7171
; ASM-NEXT: // %bb.1:
7272
; ASM-NEXT: j #.LBB0_5
7373
; ASM-NEXT: nop // Delay Slot 5
@@ -76,41 +76,43 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
7676
; ASM-NEXT: nop // Delay Slot 2
7777
; ASM-NEXT: mova r1, #0 // Delay Slot 1
7878
; ASM-NEXT: .LBB0_2: // %entry.new
79-
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm3, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv
79+
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm0, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv
8080
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc4, dc0
81-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm0, s1, [p2], d0; nopx
82-
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; movxm ls, #.LBB0_3
81+
; ASM-NEXT: vlda.3d.ups.s32.d8 cm8, s1, [p2], d0
82+
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1
8383
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
84-
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; movx r0, #-4; mov s1, r3
85-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; movxm le, #.L_LEnd0
86-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; and r0, r2, r0
84+
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r3
85+
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
86+
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; movxm ls, #.LBB0_3
87+
; ASM-NEXT: mova r0, #-4; movxm le, #.L_LEnd0
88+
; ASM-NEXT: and r0, r2, r0
8789
; ASM-NEXT: mova r2, #-2; add r0, r0, #-4
8890
; ASM-NEXT: lshl r0, r0, r2; mov crSRSSign, r6
8991
; ASM-NEXT: add r0, r0, #1; mov s0, r5
9092
; ASM-NEXT: add.nc lc, r0, #-1
9193
; ASM-NEXT: .LBB0_3: // %for.body
9294
; ASM-NEXT: // =>This Inner Loop Header: Depth=1
93-
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm0, cm0, cm3, r1
94-
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
95-
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
95+
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm0, cm8, cm0, r1
9696
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
97-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm0, s1, [p2], d0; nopx ; vadd cm3, cm6, cm2, r1
98-
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm0, s0, [p3], #32; vadd cm2, cm5, cm4, r1
99-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm1, cm7, cm1, r1
100-
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
97+
; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm8, s1, [p2], d0; nops ; nopxm ; vadd cm2, cm6, cm2, r1
98+
; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nops ; nopxm ; nopv
99+
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm0, s1, [p1], m1; nops ; nopxm ; vadd cm3, cm5, cm3, r1
100+
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; vst.srs.d8.s32 cm0, s0, [p3], #32; nopx ; vadd cm7, cm1, cm4, r1
101101
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1
102+
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0
103+
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm2, s0, [p3], #32
102104
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; vst.srs.d8.s32 cm3, s0, [p3], #32
103-
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; vst.srs.d8.s32 cm2, s0, [p3], #32
104105
; ASM-NEXT: .L_LEnd0:
105-
; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; vst.srs.d8.s32 cm1, s0, [p3], #32; nopxm ; nopv
106+
; ASM-NEXT: nopb ; nopa ; vst.srs.d8.s32 cm7, s0, [p3], #32; nopxm ; nopv
106107
; ASM-NEXT: // %bb.4:
107-
; ASM-NEXT: nopa ; nopb ; nopx
108+
; ASM-NEXT: nopa ; nopx
109+
; ASM-NEXT: nop
108110
; ASM-NEXT: nop
109111
; ASM-NEXT: nop
110-
; ASM-NEXT: vadd cm0, cm0, cm3, r1
111-
; ASM-NEXT: vadd cm3, cm5, cm4, r1
112+
; ASM-NEXT: vadd cm3, cm5, cm3, r1
113+
; ASM-NEXT: vadd cm0, cm8, cm0, r1
112114
; ASM-NEXT: vadd cm2, cm6, cm2, r1
113-
; ASM-NEXT: vadd cm1, cm7, cm1, r1
115+
; ASM-NEXT: vadd cm1, cm1, cm4, r1
114116
; ASM-NEXT: nop
115117
; ASM-NEXT: nop
116118
; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32

llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -275,12 +275,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
275275
; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5
276276
; DCL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m1, r11
277277
; DCL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1
278-
; DCL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]
279-
; DCL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
280-
; DCL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; vldb wh3, [p0], m6
281-
; DCL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m7; vldb wl7, [p0], m6
282-
; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb.3d wh7, [p0], d0
283-
; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m5
278+
; DCL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]
279+
; DCL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
280+
; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb wh3, [p0], m6
281+
; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m7; vldb wl7, [p0], m6
282+
; DCL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]; vldb.3d wh7, [p0], d0
283+
; DCL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5
284284
; DCL-NEXT: vldb wl6, [p1], #32
285285
; DCL-NEXT: vldb wh6, [p1], #32
286286
; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wl5, [p0], m6; and r0, r0, r9; mov r1, p0
@@ -295,13 +295,13 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
295295
; DCL-NEXT: // Parent Loop BB0_1 Depth=1
296296
; DCL-NEXT: // => This Inner Loop Header: Depth=2
297297
; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4
298-
; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4
298+
; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm7, cm7, x9, x8, r4
299299
; DCL-NEXT: vldb wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0
300300
; DCL-NEXT: vldb wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8
301301
; DCL-NEXT: vlda wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4
302-
; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm5, cm5, x7, x8, r4 // Delay Slot 5
302+
; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5
303303
; DCL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4
304-
; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm7, cm7, x9, x8, r4 // Delay Slot 3
304+
; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm5, cm5, x9, x8, r4 // Delay Slot 3
305305
; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2
306306
; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm6, cm6, x11, x8, r4 // Delay Slot 1
307307
; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1
@@ -315,9 +315,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
315315
; DCL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload
316316
; DCL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload
317317
; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload
318-
; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm7, x0, x8, r4 // 4-byte Folded Reload
319-
; DCL-NEXT: vmac cm8, cm5, x7, x8, r4
320-
; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4 // 4-byte Folded Reload
318+
; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm5, x0, x8, r4 // 4-byte Folded Reload
319+
; DCL-NEXT: vmac cm8, cm4, x7, x8, r4
320+
; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm7, x9, x8, r4 // 4-byte Folded Reload
321321
; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4
322322
; DCL-NEXT: st m7, [sp, #-96]; vshuffle x6, x4, x2, r2 // 4-byte Folded Spill
323323
; DCL-NEXT: vmac cm6, cm6, x6, x8, r4
@@ -410,12 +410,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
410410
; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5
411411
; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m1, r10
412412
; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1
413-
; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]
414-
; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
415-
; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; vldb wh3, [p0], m6
416-
; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m7; vldb wl7, [p0], m6
417-
; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb.3d wh7, [p0], d0
418-
; ZOL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m5; movxm ls, #.LBB0_2
413+
; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]
414+
; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
415+
; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; vldb wh3, [p0], m6
416+
; ZOL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m7; vldb wl7, [p0], m6
417+
; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32]; vldb.3d wh7, [p0], d0
418+
; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m5; movxm ls, #.LBB0_2
419419
; ZOL-NEXT: vldb wl6, [p1], #32; movxm le, #.L_LEnd0
420420
; ZOL-NEXT: vlda wh6, [p1], #32; vldb wl5, [p0], m6; mov r1, p0
421421
; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wh5, [p0], m6; and r0, r0, r9
@@ -430,12 +430,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
430430
; ZOL-NEXT: // Parent Loop BB0_1 Depth=1
431431
; ZOL-NEXT: // => This Inner Loop Header: Depth=2
432432
; ZOL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4
433-
; ZOL-NEXT: vldb wl5, [p0], m6; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4
433+
; ZOL-NEXT: vldb wl5, [p0], m6; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm7, cm7, x9, x8, r4
434434
; ZOL-NEXT: vldb wh5, [p0], m6; nopa ; nops ; nopx ; vshift.align x2, x2, s1, x3, r0; nopv
435435
; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4
436-
; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshuffle x7, x4, x2, r2; vmac cm5, cm5, x7, x8, r4
436+
; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4
437437
; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4
438-
; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm7, cm7, x9, x8, r4
438+
; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm5, cm5, x9, x8, r4
439439
; ZOL-NEXT: vldb wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4
440440
; ZOL-NEXT: .L_LEnd0:
441441
; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm6, cm6, x11, x8, r4
@@ -450,9 +450,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
450450
; ZOL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload
451451
; ZOL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload
452452
; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload
453-
; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm7, x0, x8, r4 // 4-byte Folded Reload
454-
; ZOL-NEXT: vmac cm8, cm5, x7, x8, r4
455-
; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4 // 4-byte Folded Reload
453+
; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm5, x0, x8, r4 // 4-byte Folded Reload
454+
; ZOL-NEXT: vmac cm8, cm4, x7, x8, r4
455+
; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm7, x9, x8, r4 // 4-byte Folded Reload
456456
; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4
457457
; ZOL-NEXT: st m7, [sp, #-96]; vshuffle x6, x4, x2, r2 // 4-byte Folded Spill
458458
; ZOL-NEXT: vmac cm6, cm6, x6, x8, r4

0 commit comments

Comments
 (0)