@@ -275,12 +275,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
275275; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5
276276; DCL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m1, r11
277277; DCL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1
278- ; DCL-NEXT: vlda.ups.s32.s16 bmh5 , s0, [p2, #32]
279- ; DCL-NEXT: vlda.ups.s32.s16 bml5 , s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
280- ; DCL-NEXT: vlda.ups.s32.s16 bmh4 , s0, [p2, #32]; vldb wh3, [p0], m6
281- ; DCL-NEXT: vlda.ups.s32.s16 bml4 , s0, [p2], m7; vldb wl7, [p0], m6
282- ; DCL-NEXT: vlda.ups.s32.s16 bmh7 , s0, [p2, #32]; vldb.3d wh7, [p0], d0
283- ; DCL-NEXT: vlda.ups.s32.s16 bml7 , s0, [p2], m5
278+ ; DCL-NEXT: vlda.ups.s32.s16 bmh4 , s0, [p2, #32]
279+ ; DCL-NEXT: vlda.ups.s32.s16 bml4 , s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
280+ ; DCL-NEXT: vlda.ups.s32.s16 bmh7 , s0, [p2, #32]; vldb wh3, [p0], m6
281+ ; DCL-NEXT: vlda.ups.s32.s16 bml7 , s0, [p2], m7; vldb wl7, [p0], m6
282+ ; DCL-NEXT: vlda.ups.s32.s16 bmh5 , s0, [p2, #32]; vldb.3d wh7, [p0], d0
283+ ; DCL-NEXT: vlda.ups.s32.s16 bml5 , s0, [p2], m5
284284; DCL-NEXT: vldb wl6, [p1], #32
285285; DCL-NEXT: vldb wh6, [p1], #32
286286; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wl5, [p0], m6; and r0, r0, r9; mov r1, p0
@@ -295,13 +295,13 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
295295; DCL-NEXT: // Parent Loop BB0_1 Depth=1
296296; DCL-NEXT: // => This Inner Loop Header: Depth=2
297297; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4
298- ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4 , x9, x8, r4
298+ ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm7, cm7 , x9, x8, r4
299299; DCL-NEXT: vldb wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0
300300; DCL-NEXT: vldb wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8
301301; DCL-NEXT: vlda wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4
302- ; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm5, cm5 , x7, x8, r4 // Delay Slot 5
302+ ; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4 , x7, x8, r4 // Delay Slot 5
303303; DCL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4
304- ; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm7, cm7 , x9, x8, r4 // Delay Slot 3
304+ ; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm5, cm5 , x9, x8, r4 // Delay Slot 3
305305; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2
306306; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm6, cm6, x11, x8, r4 // Delay Slot 1
307307; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1
@@ -315,9 +315,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
315315; DCL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload
316316; DCL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload
317317; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload
318- ; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm7 , x0, x8, r4 // 4-byte Folded Reload
319- ; DCL-NEXT: vmac cm8, cm5 , x7, x8, r4
320- ; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4 , x9, x8, r4 // 4-byte Folded Reload
318+ ; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm5 , x0, x8, r4 // 4-byte Folded Reload
319+ ; DCL-NEXT: vmac cm8, cm4 , x7, x8, r4
320+ ; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm7 , x9, x8, r4 // 4-byte Folded Reload
321321; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4
322322; DCL-NEXT: st m7, [sp, #-96]; vshuffle x6, x4, x2, r2 // 4-byte Folded Spill
323323; DCL-NEXT: vmac cm6, cm6, x6, x8, r4
@@ -410,12 +410,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
410410; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5
411411; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m1, r10
412412; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1
413- ; ZOL-NEXT: vlda.ups.s32.s16 bmh5 , s0, [p2, #32]
414- ; ZOL-NEXT: vlda.ups.s32.s16 bml5 , s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
415- ; ZOL-NEXT: vlda.ups.s32.s16 bmh4 , s0, [p2, #32]; vldb wh3, [p0], m6
416- ; ZOL-NEXT: vlda.ups.s32.s16 bml4 , s0, [p2], m7; vldb wl7, [p0], m6
417- ; ZOL-NEXT: vlda.ups.s32.s16 bmh7 , s0, [p2, #32]; vldb.3d wh7, [p0], d0
418- ; ZOL-NEXT: vlda.ups.s32.s16 bml7 , s0, [p2], m5; movxm ls, #.LBB0_2
413+ ; ZOL-NEXT: vlda.ups.s32.s16 bmh4 , s0, [p2, #32]
414+ ; ZOL-NEXT: vlda.ups.s32.s16 bml4 , s0, [p2], m5; vldb wl3, [p0], m6; mov r0, p0
415+ ; ZOL-NEXT: vlda.ups.s32.s16 bmh7 , s0, [p2, #32]; vldb wh3, [p0], m6
416+ ; ZOL-NEXT: vlda.ups.s32.s16 bml7 , s0, [p2], m7; vldb wl7, [p0], m6
417+ ; ZOL-NEXT: vlda.ups.s32.s16 bmh5 , s0, [p2, #32]; vldb.3d wh7, [p0], d0
418+ ; ZOL-NEXT: vlda.ups.s32.s16 bml5 , s0, [p2], m5; movxm ls, #.LBB0_2
419419; ZOL-NEXT: vldb wl6, [p1], #32; movxm le, #.L_LEnd0
420420; ZOL-NEXT: vlda wh6, [p1], #32; vldb wl5, [p0], m6; mov r1, p0
421421; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; vldb wh5, [p0], m6; and r0, r0, r9
@@ -430,12 +430,12 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
430430; ZOL-NEXT: // Parent Loop BB0_1 Depth=1
431431; ZOL-NEXT: // => This Inner Loop Header: Depth=2
432432; ZOL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4
433- ; ZOL-NEXT: vldb wl5, [p0], m6; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4 , x9, x8, r4
433+ ; ZOL-NEXT: vldb wl5, [p0], m6; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm7, cm7 , x9, x8, r4
434434; ZOL-NEXT: vldb wh5, [p0], m6; nopa ; nops ; nopx ; vshift.align x2, x2, s1, x3, r0; nopv
435435; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4
436- ; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshuffle x7, x4, x2, r2; vmac cm5, cm5 , x7, x8, r4
436+ ; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshuffle x7, x4, x2, r2; vmac cm4, cm4 , x7, x8, r4
437437; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4
438- ; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm7, cm7 , x9, x8, r4
438+ ; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm5, cm5 , x9, x8, r4
439439; ZOL-NEXT: vldb wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4
440440; ZOL-NEXT: .L_LEnd0:
441441; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm6, cm6, x11, x8, r4
@@ -450,9 +450,9 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
450450; ZOL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload
451451; ZOL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload
452452; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload
453- ; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm7 , x0, x8, r4 // 4-byte Folded Reload
454- ; ZOL-NEXT: vmac cm8, cm5 , x7, x8, r4
455- ; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4 , x9, x8, r4 // 4-byte Folded Reload
453+ ; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm5 , x0, x8, r4 // 4-byte Folded Reload
454+ ; ZOL-NEXT: vmac cm8, cm4 , x7, x8, r4
455+ ; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm7 , x9, x8, r4 // 4-byte Folded Reload
456456; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4
457457; ZOL-NEXT: st m7, [sp, #-96]; vshuffle x6, x4, x2, r2 // 4-byte Folded Spill
458458; ZOL-NEXT: vmac cm6, cm6, x6, x8, r4
0 commit comments