-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathTanhTemplated-swp.ll
331 lines (305 loc) · 19.3 KB
/
TanhTemplated-swp.ll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s
; This test is meant as a quick way to spot quality-of-results (QoR)
; regressions. The loop in this test can only be software-pipelined (Pre-SWP)
; because some WAW dependencies related to sticky status registers are removed.
; Declarations of the AIE2 target intrinsics called by @TanhTemplated below.
; Each declaration references one of the attribute groups #0-#3 defined near
; the end of the file; the "Function Attrs" comment above each one spells out
; the contents of that group.
; Function Attrs: nounwind memory(none)
declare <8 x i64> @llvm.aie2.v16accfloat() #0
; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat) #0
; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.v32bfloat16() #0
; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat>, i32) #0
; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat>, <32 x bfloat>, i32) #1
; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat>, <32 x bfloat>) #2
; Function Attrs: nounwind memory(none)
declare <16 x bfloat> @llvm.aie2.v16bfloat16() #0
; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64>) #1
; Function Attrs: nounwind memory(none)
declare <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat>, i32) #0
; Function Attrs: nounwind memory(none)
declare <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat>) #0
; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat>, <16 x bfloat>, i32) #0
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64>, <8 x i64>, i32) #3
; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1
; Function Attrs: nounwind memory(none)
declare <32 x i16> @llvm.aie2.vbroadcast16.I512(i32) #0
; @TanhTemplated(%ifm, %ofm, %params)
;   %ifm    - input pointer; each loop iteration reads two <16 x bfloat>
;             vectors (64 bytes) from it.
;   %ofm    - output pointer; each loop iteration writes two <16 x bfloat>
;             vectors (64 bytes) to it.
;   %params - parameter block; only the i32 at offset 0 is read here. It is
;             shifted right by 5 to form the hardware-loop trip count
;             (presumably an element count, 32 elements per iteration -
;             confirm against the kernel source).
;
; Per-vector dataflow as written in the IR body (constants are decoded bf16 bit
; patterns; mac/msc are presumed to compute acc + a*b / acc - a*b and the
; vmin.ge / vmax.lt pair is presumed to clamp - confirm against the AIE2
; intrinsic documentation):
;   c = vmax.lt(vmin.ge(bf16(2.0 * x), 4.0), -4.0)
;   s = msc(bf16(0.03125 * (c & 0x7fff)), c, mac(c, 0.25, 0.5))
;   y = bf16(sub(2.0 * bf16(s), 1.0))
;
; The FileCheck assertions that follow the define line are autogenerated (see
; the NOTE at the top of the file) and should be regenerated with the script
; rather than edited by hand. They expect a hardware loop (ls/le/lc are written
; in %bb.0, with lc = (params[0] >> 5) - 2) whose body .LBB0_1 is preceded by
; prologue code in %bb.0 and followed by epilogue code in %bb.2.
; Function Attrs: mustprogress noinline
define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 32 dereferenceable(64) %params) align 2 {
; CHECK-LABEL: TanhTemplated:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %for.body.lr.ph
; CHECK-NEXT: nop ; movxm r3, #16512
; CHECK-NEXT: movxm r4, #-16256
; CHECK-NEXT: movxm r5, #32767
; CHECK-NEXT: movxm r0, #16256
; CHECK-NEXT: movxm r1, #16384
; CHECK-NEXT: lda r0, [p2, #0]; movxm r2, #16128
; CHECK-NEXT: vbcst.16 x0, r1
; CHECK-NEXT: vldb wl3, [p0], #32; vbcst.16 x3, r0
; CHECK-NEXT: vbcst.16 x2, r2
; CHECK-NEXT: mova r1, #0; vconv.fp32.bf16 bmh0, wl2
; CHECK-NEXT: vbcst.16 x2, r1
; CHECK-NEXT: vldb wl3, [p0], #32; vmov wh0, wl2
; CHECK-NEXT: mova r1, #-5; vmov wh3, wl2
; CHECK-NEXT: mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1
; CHECK-NEXT: movxm r7, #16000
; CHECK-NEXT: vbcst.16 x1, r3
; CHECK-NEXT: vbcst.16 x10, r4
; CHECK-NEXT: vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
; CHECK-NEXT: vbcst.16 x6, r6
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
; CHECK-NEXT: vmov wh6, wl2
; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1
; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
; CHECK-NEXT: vmov wh7, wl2
; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1
; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vband x7, x8, x5
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
; CHECK-NEXT: vmov wh4, wl2
; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
; CHECK-NEXT: nop
; CHECK-NEXT: vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1
; CHECK-NEXT: vmul.f bmh5, x0, x7, r1
; CHECK-NEXT: movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1
; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; add.nc lc, r2, #-2
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3; nops
; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
; CHECK-NEXT: vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh3, x6, x9, r1
; CHECK-NEXT: vmac.f bmh5, bmh0, x3, x4, r1
; CHECK-NEXT: vmul.f bmh4, x6, x9, r1
; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0
; CHECK-NEXT: vmul.f bmh7, x0, x7, r1
; CHECK-NEXT: vmac.f bml2, bmh0, x5, x4, r1
; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1
; CHECK-NEXT: nop
; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1
; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1
; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh7, wl2; nops
; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
; CHECK-NEXT: vband x1, x8, x3
; CHECK-NEXT: vband x8, x8, x10
; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
; CHECK-NEXT: vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
; CHECK-NEXT: vmul.f bmh2, x6, x1, r1
; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh3, x6, x8, r1
; CHECK-NEXT: vmov wh3, wl2
; CHECK-NEXT: vmov wh10, wl2
; CHECK-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh3
; CHECK-NEXT: vmsc.f bmh2, bmh4, x4, x3, r1
; CHECK-NEXT: vmsc.f bmh0, bmh0, x4, x10, r1
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh0
; CHECK-NEXT: vmul.f bmh2, x4, x0, r1
; CHECK-NEXT: vmul.f bmh0, x4, x0, r1
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vsub.f bmh2, bmh2, bmh1, r0
; CHECK-NEXT: vsub.f bmh0, bmh0, bmh1, r0
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: ret lr
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 5
; CHECK-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: mov r16, r8 // Delay Slot 1
; Loop preheader: builds the loop-invariant constants and the trip count.
for.body.lr.ph:
%0 = tail call noundef <16 x bfloat> @llvm.aie2.v16bfloat16()
%1 = tail call noundef <8 x i64> @llvm.aie2.v16accfloat()
%2 = tail call noundef <32 x bfloat> @llvm.aie2.v32bfloat16()
; bf16 0x3F80 = 1.0; its 256-bit extract %4 is widened to an accumulator
; inside the loop (%62) and used as the second operand of sub.accfloat.
%3 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F80)
%4 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %3, i32 0)
; bf16 0x4000 = 2.0; %6 feeds the multiplier vector %15 below.
%5 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4000)
%6 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %5, i32 0)
%7 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %6)
; bf16 0x3F00 = 0.5; %10 is the accumulator input of both mac16 calls.
%8 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F00)
%9 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %8, i32 0)
%10 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %9)
; Trip count = (i32 at %params + 0) >> 5.
%11 = load i32, ptr %params, align 32, !tbaa !4
%div16 = lshr i32 %11, 5
; bf16 0x0000 = 0.0; %13 is the 256-bit value passed to every upd.bf512.bf256
; call with index 1 (presumably filling the upper half - confirm).
%12 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR0000)
%13 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %12, i32 0)
; %15 = 512-bit vector built from the 2.0 half (%6) and the 0.0 half (%13).
%14 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %6, i32 0)
%15 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %14, <16 x bfloat> %13, i32 1)
; Arm the hardware loop with the trip count.
call void @llvm.set.loop.iterations.i32(i32 %div16)
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
; Loop body: processes two <16 x bfloat> vectors per iteration.
for.body: ; preds = %for.body, %for.body.lr.ph
%p_out.0.in17 = phi ptr [ %ofm, %for.body.lr.ph ], [ %add.ptr.i9.i, %for.body ]
%p_in.0.in16 = phi ptr [ %ifm, %for.body.lr.ph ], [ %add.ptr.i.i.i, %for.body ]
%p_out.0 = addrspacecast ptr %p_out.0.in17 to ptr addrspace(6)
%p_in.0 = addrspacecast ptr %p_in.0.in16 to ptr addrspace(5)
; Load the two input vectors at byte offsets 0 and 32; the next iteration's
; input pointer is the current one plus 64.
%16 = load <16 x bfloat>, ptr addrspace(5) %p_in.0, align 32, !tbaa !11
%add.ptr.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 32
%add.ptr.ascast.i.i = addrspacecast ptr %add.ptr.i.i to ptr addrspace(5)
%17 = load <16 x bfloat>, ptr addrspace(5) %add.ptr.ascast.i.i, align 32, !tbaa !11
%add.ptr.i.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 64
; ---- First vector (%16); result is %64 ----
; Multiply the input by the 2.0 vector %15 and convert back to bf16.
%18 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %16, i32 0)
%19 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %18, <16 x bfloat> %13, i32 1)
%20 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %19, i32 60)
%21 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %20)
; vmin.ge against bf16 0x4080 = 4.0 (%25), then vmax.lt against
; bf16 0xC080 = -4.0 (%32); the result %36 is called c in the header.
%22 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4080)
%23 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %22, i32 0)
%24 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %21, i32 0)
%25 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %23, i32 0)
%26 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %24, <32 x bfloat> %25)
%27 = extractvalue { <32 x bfloat>, i32 } %26, 0
%28 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %27, i32 0)
%29 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xRC080)
%30 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %29, i32 0)
%31 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %28, i32 0)
%32 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %30, i32 0)
%33 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %31, <32 x bfloat> %32)
%34 = extractvalue { <32 x bfloat>, i32 } %33, 0
%35 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %34, i32 0)
%36 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %35, i32 0)
; AND every 16-bit lane with 32767 (0x7FFF), i.e. clear bit 15 of each lane.
%37 = bitcast <32 x bfloat> %36 to <32 x i16>
%38 = tail call noundef <32 x i16> @llvm.aie2.vbroadcast16.I512(i32 32767)
%and.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %37
%39 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
%40 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %39, i32 0)
; %47 = mul16(bf16 0x3D00 = 0.03125, masked value).
%41 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3D00)
%42 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %41, i32 0)
%43 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %42, i32 0)
%44 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %43, <16 x bfloat> %13, i32 1)
%45 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %40, i32 0)
%46 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %45, <16 x bfloat> %13, i32 1)
%47 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %46, i32 60)
; %53 = mac16(c, bf16 0x3E80 = 0.25, accumulator %10 = 0.5).
%48 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3E80)
%49 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %48, i32 0)
%50 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %36, <16 x bfloat> %13, i32 1)
%51 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %49, i32 0)
%52 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %51, <16 x bfloat> %13, i32 1)
%53 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %50, <32 x bfloat> %52, <8 x i64> %10, i32 60)
; %57 = msc16(bf16(%47), c, accumulator %53).
%54 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %47)
%55 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %54, i32 0)
%56 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %55, <16 x bfloat> %13, i32 1)
%57 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %56, <32 x bfloat> %50, <8 x i64> %53, i32 60)
; Multiply by the 2.0 vector %15, apply sub.accfloat with the 1.0
; accumulator %62, and convert to bf16: %64 is the first output vector.
%58 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %57)
%59 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %58, i32 0)
%60 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %59, <16 x bfloat> %13, i32 1)
%61 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %60, <32 x bfloat> %15, i32 60)
%62 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %4)
%63 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %61, <8 x i64> %62, i32 28)
%64 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %63)
; ---- Second vector (%17); result is %95 ----
; Same chain as above, reusing the constants %25, %32, %38, %44, %52 and %62
; that were built while processing the first vector.
%65 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %17, i32 0)
%66 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %65, <16 x bfloat> %13, i32 1)
%67 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %66, i32 60)
%68 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %67)
%69 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %68, i32 0)
%70 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %69, <32 x bfloat> %25)
%71 = extractvalue { <32 x bfloat>, i32 } %70, 0
%72 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %71, i32 0)
%73 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %72, i32 0)
%74 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %73, <32 x bfloat> %32)
%75 = extractvalue { <32 x bfloat>, i32 } %74, 0
%76 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %75, i32 0)
%77 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %76, i32 0)
%78 = bitcast <32 x bfloat> %77 to <32 x i16>
%and.i.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %78
%79 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
%80 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %79, i32 0)
%81 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %80, i32 0)
%82 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %81, <16 x bfloat> %13, i32 1)
%83 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %82, i32 60)
%84 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %77, <16 x bfloat> %13, i32 1)
%85 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %84, <32 x bfloat> %52, <8 x i64> %10, i32 60)
%86 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %83)
%87 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %86, i32 0)
%88 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %87, <16 x bfloat> %13, i32 1)
%89 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %88, <32 x bfloat> %84, <8 x i64> %85, i32 60)
%90 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %89)
%91 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %90, i32 0)
%92 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %91, <16 x bfloat> %13, i32 1)
%93 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %92, <32 x bfloat> %15, i32 60)
%94 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %93, <8 x i64> %62, i32 28)
%95 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %94)
; Store both results at byte offsets 0 and 32; the next iteration's output
; pointer is the current one plus 64.
store <16 x bfloat> %64, ptr addrspace(6) %p_out.0, align 32, !tbaa !11
%add.ptr.i7.i.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 32
%add.ptr.ascast.i8.i.i = addrspacecast ptr %add.ptr.i7.i.i to ptr addrspace(6)
store <16 x bfloat> %95, ptr addrspace(6) %add.ptr.ascast.i8.i.i, align 32, !tbaa !11
%add.ptr.i9.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 64
; Hardware-loop latch: decrement by 1 and branch back while %96 is true.
%96 = call i1 @llvm.loop.decrement.i32(i32 1)
br i1 %96, label %for.body, label %for.cond.cleanup, !llvm.loop !12
}
; Remaining declarations. @llvm.aie2.vmin.gebf16 is declared here, after its
; uses in @TanhTemplated; LLVM IR allows a global to be referenced before its
; declaration. The two llvm.*loop* intrinsics are the generic LLVM
; hardware-loop intrinsics called in the preheader (set.loop.iterations) and
; in the loop latch (loop.decrement).
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat>, <32 x bfloat>) #2
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #5
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #5
; Attribute groups. #0-#3 and #5 are referenced by the intrinsic declarations
; in this file. #4 is not referenced by any declaration or definition in the
; visible part of this file.
attributes #0 = { nounwind memory(none) }
attributes #1 = { nounwind memory(inaccessiblemem: read) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
attributes #4 = { mustprogress noinline "no-builtin-memcpy" "no-jump-tables"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #5 = { nocallback noduplicate nofree nosync nounwind willreturn }
; Module-level named metadata: linker options (empty), module flags, and the
; producer identification string.
!llvm.linker.options = !{}
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{!"clang version 19.0.0git (/scratch/llvm-aie/clang 640962db16e997d4aaf9dadcf09d9a4fc7e06fe4)"}
; TBAA nodes. !4 tags the i32 load from %params: an "int" member at offset 0
; of the struct type !5. !11 ("omnipotent char") tags the vector loads and
; stores in the loop.
!4 = !{!5, !6, i64 0}
!5 = !{!"TanhTemplated", !6, i64 0, !7, i64 4, !7, i64 5, !7, i64 6, !7, i64 7, !7, i64 8, !7, i64 9, !9, i64 10, !6, i64 12, !6, i64 16, !6, i64 20, !6, i64 24, !6, i64 28, !10, i64 32}
!6 = !{!"int", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C++ TBAA"}
!9 = !{!"short", !7, i64 0}
!10 = !{!"_ZTS23tanh_templated_params_tIu6__bf16E"}
!11 = !{!7, !7, i64 0}
; Loop metadata attached to the latch branch of %for.body. !14 presumably
; states a lower bound of 4 on the iteration count - confirm against the
; llvm-aie documentation of llvm.loop.itercount.range.
!12 = distinct !{!12, !13, !14}
!13 = !{!"llvm.loop.mustprogress"}
!14 = !{!"llvm.loop.itercount.range", i64 4}