; llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s

; The test is meant as a quick way to spot quality-of-results (QoR)
; regressions. In this test, the code can only be software-pipelined (Pre-SWP)
; because of the removal of some write-after-write (WAW) dependencies related
; to sticky status registers.

; Declarations of the AIE2 target intrinsics called by @TanhTemplated below.
; The "Function Attrs" comment above each declaration spells out the contents
; of the attribute group (#N) that the declaration references.

; Function Attrs: nounwind memory(none)
declare <8 x i64> @llvm.aie2.v16accfloat() #0

; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat) #0

; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.v32bfloat16() #0

; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat>, i32) #0

; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat>, <32 x bfloat>, i32) #1

; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat>, <32 x bfloat>) #2

; Function Attrs: nounwind memory(none)
declare <16 x bfloat> @llvm.aie2.v16bfloat16() #0

; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64>) #1

; Function Attrs: nounwind memory(none)
declare <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat>, i32) #0

; Function Attrs: nounwind memory(none)
declare <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat>) #0

; Function Attrs: nounwind memory(none)
declare <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat>, <16 x bfloat>, i32) #0

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64>, <8 x i64>, i32) #3

; Function Attrs: nounwind memory(inaccessiblemem: read)
declare <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat>, <32 x bfloat>, <8 x i64>, i32) #1

; Function Attrs: nounwind memory(none)
declare <32 x i16> @llvm.aie2.vbroadcast16.I512(i32) #0

; @TanhTemplated: test kernel with a single loop. Each iteration loads two
; <16 x bfloat> vectors from %ifm, runs the same chain of AIE2 intrinsics on
; each of them, and stores two <16 x bfloat> results to %ofm. The loop trip
; count is derived from the first i32 loaded from %params.
; The assertions that follow the define line pin the exact scheduled assembly;
; the file header states they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them rather than editing them
; by hand.
; NOTE(review): the define below carries no #N attribute-group reference even
; though the comment says "mustprogress noinline" and attribute group #4 with
; those attributes is defined later in this file -- confirm this is intended.
; Function Attrs: mustprogress noinline
define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 32 dereferenceable(64) %params) align 2 {
; CHECK-LABEL: TanhTemplated:
; CHECK:         .p2align 4
; CHECK-NEXT:  // %bb.0: // %for.body.lr.ph
; CHECK-NEXT:    nop ; movxm r3, #16512
; CHECK-NEXT:    movxm r4, #-16256
; CHECK-NEXT:    movxm r5, #32767
; CHECK-NEXT:    movxm r0, #16256
; CHECK-NEXT:    movxm r1, #16384
; CHECK-NEXT:    lda r0, [p2, #0]; movxm r2, #16128
; CHECK-NEXT:    vbcst.16 x0, r1
; CHECK-NEXT:    vldb wl3, [p0], #32; vbcst.16 x3, r0
; CHECK-NEXT:    vbcst.16 x2, r2
; CHECK-NEXT:    mova r1, #0; vconv.fp32.bf16 bmh0, wl2
; CHECK-NEXT:    vbcst.16 x2, r1
; CHECK-NEXT:    vldb wl3, [p0], #32; vmov wh0, wl2
; CHECK-NEXT:    mova r1, #-5; vmov wh3, wl2
; CHECK-NEXT:    mova r1, #60; lshl r2, r0, r1; vconv.fp32.bf16 bmh1, wl3
; CHECK-NEXT:    movxm r6, #15616; vmul.f bmh2, x0, x3, r1
; CHECK-NEXT:    movxm r7, #16000
; CHECK-NEXT:    vbcst.16 x1, r3
; CHECK-NEXT:    vbcst.16 x10, r4
; CHECK-NEXT:    vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
; CHECK-NEXT:    vbcst.16 x6, r6
; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
; CHECK-NEXT:    vmov wh6, wl2
; CHECK-NEXT:    vmin_ge.bf16 x3, r16, x3, x1
; CHECK-NEXT:    or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
; CHECK-NEXT:    vmov wh7, wl2
; CHECK-NEXT:    vmin_ge.bf16 x5, r16, x5, x1
; CHECK-NEXT:    vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT:    vband x7, x8, x5
; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
; CHECK-NEXT:    vmov wh4, wl2
; CHECK-NEXT:    vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
; CHECK-NEXT:    nop
; CHECK-NEXT:    vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1
; CHECK-NEXT:    vmul.f bmh5, x0, x7, r1
; CHECK-NEXT:    movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1
; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1
; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; add.nc lc, r2, #-2
; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1
; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1
; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv
; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
; CHECK-NEXT:    nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB0_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    nopa ; nopb ; nopx ; vband x9, x8, x3; nops
; CHECK-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
; CHECK-NEXT:    vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2
; CHECK-NEXT:    vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
; CHECK-NEXT:    vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
; CHECK-NEXT:    vmov wh9, wl2; vmul.f bmh3, x6, x9, r1
; CHECK-NEXT:    vmac.f bmh5, bmh0, x3, x4, r1
; CHECK-NEXT:    vmul.f bmh4, x6, x9, r1
; CHECK-NEXT:    vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0
; CHECK-NEXT:    vmul.f bmh7, x0, x7, r1
; CHECK-NEXT:    vmac.f bml2, bmh0, x5, x4, r1
; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
; CHECK-NEXT:    vsub.f bml0, bmh2, bmh1, r0
; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1
; CHECK-NEXT:    nop
; CHECK-NEXT:    vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1
; CHECK-NEXT:    vst.conv.bf16.fp32 bml1, [p1], #32
; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1
; CHECK-NEXT:    vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10
; CHECK-NEXT:  .L_LEnd0:
; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT:  // %bb.2:
; CHECK-NEXT:    nopa ; nopb ; nopx ; vmov wh7, wl2; nops
; CHECK-NEXT:    vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
; CHECK-NEXT:    vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
; CHECK-NEXT:    vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
; CHECK-NEXT:    vband x1, x8, x3
; CHECK-NEXT:    vband x8, x8, x10
; CHECK-NEXT:    vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
; CHECK-NEXT:    vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
; CHECK-NEXT:    vmul.f bmh2, x6, x1, r1
; CHECK-NEXT:    vmov wh4, wl2; vmul.f bmh3, x6, x8, r1
; CHECK-NEXT:    vmov wh3, wl2
; CHECK-NEXT:    vmov wh10, wl2
; CHECK-NEXT:    vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
; CHECK-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh2
; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh3
; CHECK-NEXT:    vmsc.f bmh2, bmh4, x4, x3, r1
; CHECK-NEXT:    vmsc.f bmh0, bmh0, x4, x10, r1
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh2
; CHECK-NEXT:    vconv.bf16.fp32 wl4, bmh0
; CHECK-NEXT:    vmul.f bmh2, x4, x0, r1
; CHECK-NEXT:    vmul.f bmh0, x4, x0, r1
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    vsub.f bmh2, bmh2, bmh1, r0
; CHECK-NEXT:    vsub.f bmh0, bmh0, bmh1, r0
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    nop
; CHECK-NEXT:    ret lr
; CHECK-NEXT:    vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 5
; CHECK-NEXT:    vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 4
; CHECK-NEXT:    nop // Delay Slot 3
; CHECK-NEXT:    nop // Delay Slot 2
; CHECK-NEXT:    mov r16, r8 // Delay Slot 1
; Preheader: values computed once before the loop. The bfloat literals are raw
; bf16 bit patterns: 0xR3F80 = 1.0, 0xR4000 = 2.0, 0xR3F00 = 0.5, 0xR0000 = 0.0.
for.body.lr.ph:
  ; %0, %1, %2 (and %7 below) have no users in this function.
  %0 = tail call noundef <16 x bfloat> @llvm.aie2.v16bfloat16()
  %1 = tail call noundef <8 x i64> @llvm.aie2.v16accfloat()
  %2 = tail call noundef <32 x bfloat> @llvm.aie2.v32bfloat16()
  ; %4: 256-bit extract (index 0) of the 1.0 broadcast; converted to an
  ; accumulator inside the loop (%62) and used as the subtrahend there.
  %3 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F80)
  %4 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %3, i32 0)
  ; %6: 256-bit extract (index 0) of the 2.0 broadcast; feeds %14/%15.
  %5 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4000)
  %6 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %5, i32 0)
  %7 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %6)
  ; %10: the 0.5 broadcast converted to an accumulator; it is the accumulator
  ; operand of both bf.mac16.conf calls in the loop.
  %8 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3F00)
  %9 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %8, i32 0)
  %10 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %9)
  ; Trip count: the first i32 of %params, logically shifted right by 5.
  %11 = load i32, ptr %params, align 32, !tbaa !4
  %div16 = lshr i32 %11, 5
  ; %13: 256-bit extract of the 0.0 broadcast. It is the vector passed to every
  ; upd.bf512.bf256(..., i32 1) call in this function -- presumably zero-filling
  ; the other 256-bit half of the 512-bit register; confirm intrinsic semantics.
  %12 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR0000)
  %13 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %12, i32 0)
  ; %15: 512-bit vector built from the 2.0 half (%6) and the zero half (%13).
  %14 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %6, i32 0)
  %15 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %14, <16 x bfloat> %13, i32 1)
  ; Hand the trip count to the loop-control intrinsic pair
  ; (set.loop.iterations here, loop.decrement at the end of for.body).
  call void @llvm.set.loop.iterations.i32(i32 %div16)
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

; Loop body: two copies of the same intrinsic chain per iteration, one on the
; vector loaded at offset 0 (%16) and one on the vector at offset 32 (%17).
for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %p_out.0.in17 = phi ptr [ %ofm, %for.body.lr.ph ], [ %add.ptr.i9.i, %for.body ]
  %p_in.0.in16 = phi ptr [ %ifm, %for.body.lr.ph ], [ %add.ptr.i.i.i, %for.body ]
  %p_out.0 = addrspacecast ptr %p_out.0.in17 to ptr addrspace(6)
  %p_in.0 = addrspacecast ptr %p_in.0.in16 to ptr addrspace(5)
  ; Two 32-byte input loads; the input pointer advances by 64 bytes per
  ; iteration (%add.ptr.i.i.i feeds the phi above).
  %16 = load <16 x bfloat>, ptr addrspace(5) %p_in.0, align 32, !tbaa !11
  %add.ptr.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 32
  %add.ptr.ascast.i.i = addrspacecast ptr %add.ptr.i.i to ptr addrspace(5)
  %17 = load <16 x bfloat>, ptr addrspace(5) %add.ptr.ascast.i.i, align 32, !tbaa !11
  %add.ptr.i.i.i = getelementptr inbounds i8, ptr %p_in.0.in16, i20 64
  ; --- First chain (input %16) ---
  ; Multiply the 2.0 vector (%15) by the input, conf operand 60.
  %18 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %16, i32 0)
  %19 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %18, <16 x bfloat> %13, i32 1)
  %20 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %19, i32 60)
  %21 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %20)
  ; vmin.gebf16 against the 0xR4080 (4.0) broadcast, then vmax.ltbf16 against
  ; the 0xRC080 (-4.0) broadcast -- presumably a clamp to [-4, 4]; confirm the
  ; intrinsic semantics. Only element 0 of each returned struct is used.
  %22 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR4080)
  %23 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %22, i32 0)
  %24 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %21, i32 0)
  %25 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %23, i32 0)
  %26 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %24, <32 x bfloat> %25)
  %27 = extractvalue { <32 x bfloat>, i32 } %26, 0
  %28 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %27, i32 0)
  %29 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xRC080)
  %30 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %29, i32 0)
  %31 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %28, i32 0)
  %32 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %30, i32 0)
  %33 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %31, <32 x bfloat> %32)
  %34 = extractvalue { <32 x bfloat>, i32 } %33, 0
  %35 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %34, i32 0)
  %36 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %35, i32 0)
  ; Bitwise AND of each 16-bit lane with the vbroadcast16 of 32767 (0x7FFF);
  ; with a 0x7FFF mask in a lane this clears the bf16 sign bit (bit 15).
  %37 = bitcast <32 x bfloat> %36 to <32 x i16>
  %38 = tail call noundef <32 x i16> @llvm.aie2.vbroadcast16.I512(i32 32767)
  %and.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %37
  %39 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
  %40 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %39, i32 0)
  ; %47: the 0xR3D00 (0.03125) vector multiplied by the masked value.
  %41 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3D00)
  %42 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %41, i32 0)
  %43 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %42, i32 0)
  %44 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %43, <16 x bfloat> %13, i32 1)
  %45 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %40, i32 0)
  %46 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %45, <16 x bfloat> %13, i32 1)
  %47 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %46, i32 60)
  ; %53: bf.mac16.conf of the unmasked value (%50) and the 0xR3E80 (0.25)
  ; vector with accumulator %10 (0.5).
  %48 = tail call noundef <32 x bfloat> @llvm.aie2.vbroadcast16.bf512(bfloat 0xR3E80)
  %49 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %48, i32 0)
  %50 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %36, <16 x bfloat> %13, i32 1)
  %51 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %49, i32 0)
  %52 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %51, <16 x bfloat> %13, i32 1)
  %53 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %50, <32 x bfloat> %52, <8 x i64> %10, i32 60)
  ; %57: bf.msc16.conf of (%47 as bf16) and %50 with accumulator %53 --
  ; presumably a multiply-subtract from the accumulator; confirm semantics.
  %54 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %47)
  %55 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %54, i32 0)
  %56 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %55, <16 x bfloat> %13, i32 1)
  %57 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %56, <32 x bfloat> %50, <8 x i64> %53, i32 60)
  ; Multiply by the 2.0 vector (%15), then sub.accfloat with the 1.0
  ; accumulator (%62), conf operand 28. %64 is the first stored result.
  %58 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %57)
  %59 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %58, i32 0)
  %60 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %59, <16 x bfloat> %13, i32 1)
  %61 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %60, <32 x bfloat> %15, i32 60)
  %62 = tail call noundef <8 x i64> @llvm.aie2.v16bf16.to.v16accfloat(<16 x bfloat> %4)
  %63 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %61, <8 x i64> %62, i32 28)
  %64 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %63)
  ; --- Second chain (input %17) ---
  ; Same sequence as above, reusing %25, %32, %38, %44, %52 and %62 from the
  ; first chain instead of recomputing them. %95 is the second stored result.
  %65 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %17, i32 0)
  %66 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %65, <16 x bfloat> %13, i32 1)
  %67 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %15, <32 x bfloat> %66, i32 60)
  %68 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %67)
  %69 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %68, i32 0)
  %70 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat> %69, <32 x bfloat> %25)
  %71 = extractvalue { <32 x bfloat>, i32 } %70, 0
  %72 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %71, i32 0)
  %73 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %72, i32 0)
  %74 = tail call { <32 x bfloat>, i32 } @llvm.aie2.vmax.ltbf16(<32 x bfloat> %73, <32 x bfloat> %32)
  %75 = extractvalue { <32 x bfloat>, i32 } %74, 0
  %76 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %75, i32 0)
  %77 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %76, i32 0)
  %78 = bitcast <32 x bfloat> %77 to <32 x i16>
  %and.i.i.i.i.i.i.i.i.i.i.i.i = and <32 x i16> %38, %78
  %79 = bitcast <32 x i16> %and.i.i.i.i.i.i.i.i.i.i.i.i to <32 x bfloat>
  %80 = tail call <16 x bfloat> @llvm.aie2.ext.bf256.bf512(<32 x bfloat> %79, i32 0)
  %81 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %80, i32 0)
  %82 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %81, <16 x bfloat> %13, i32 1)
  %83 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %44, <32 x bfloat> %82, i32 60)
  %84 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %77, <16 x bfloat> %13, i32 1)
  %85 = tail call noundef <8 x i64> @llvm.aie2.bf.mac16.conf(<32 x bfloat> %84, <32 x bfloat> %52, <8 x i64> %10, i32 60)
  %86 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %83)
  %87 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %86, i32 0)
  %88 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %87, <16 x bfloat> %13, i32 1)
  %89 = tail call noundef <8 x i64> @llvm.aie2.bf.msc16.conf(<32 x bfloat> %88, <32 x bfloat> %84, <8 x i64> %85, i32 60)
  %90 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %89)
  %91 = tail call <32 x bfloat> @llvm.aie2.set.bf512.bf256(<16 x bfloat> %90, i32 0)
  %92 = tail call <32 x bfloat> @llvm.aie2.upd.bf512.bf256(<32 x bfloat> %91, <16 x bfloat> %13, i32 1)
  %93 = tail call noundef <8 x i64> @llvm.aie2.bf.mul16.conf(<32 x bfloat> %92, <32 x bfloat> %15, i32 60)
  %94 = tail call noundef <8 x i64> @llvm.aie2.sub.accfloat(<8 x i64> %93, <8 x i64> %62, i32 28)
  %95 = tail call noundef <16 x bfloat> @llvm.aie2.v16accfloat.to.v16bf16(<8 x i64> %94)
  ; Two 32-byte output stores; the output pointer advances by 64 bytes per
  ; iteration (%add.ptr.i9.i feeds the phi at the top of the block).
  store <16 x bfloat> %64, ptr addrspace(6) %p_out.0, align 32, !tbaa !11
  %add.ptr.i7.i.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 32
  %add.ptr.ascast.i8.i.i = addrspacecast ptr %add.ptr.i7.i.i to ptr addrspace(6)
  store <16 x bfloat> %95, ptr addrspace(6) %add.ptr.ascast.i8.i.i, align 32, !tbaa !11
  %add.ptr.i9.i = getelementptr inbounds i8, ptr %p_out.0.in17, i20 64
  ; Back edge: loop again while loop.decrement returns true. The branch carries
  ; the loop metadata node !12.
  %96 = call i1 @llvm.loop.decrement.i32(i32 1)
  br i1 %96, label %for.body, label %for.cond.cleanup, !llvm.loop !12
}

; Remaining declarations used by @TanhTemplated: the vmin.gebf16 intrinsic
; called in for.body, and the loop-control pair (set.loop.iterations is called
; in the preheader; the result of loop.decrement drives the back-edge branch).

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare { <32 x bfloat>, i32 } @llvm.aie2.vmin.gebf16(<32 x bfloat>, <32 x bfloat>) #2

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #5

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #5

; Attribute groups referenced by the declarations in this file.
; #0-#3 are used by the AIE2 intrinsic declarations and #5 by the two
; loop-control intrinsic declarations.
; NOTE(review): #4 is not referenced by any declaration or definition visible
; in this file (the @TanhTemplated define has no #N suffix) -- confirm whether
; it is intentionally left unused.
attributes #0 = { nounwind memory(none) }
attributes #1 = { nounwind memory(inaccessiblemem: read) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
attributes #4 = { mustprogress noinline "no-builtin-memcpy" "no-jump-tables"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #5 = { nocallback noduplicate nofree nosync nounwind willreturn }

; Module-level named metadata.
!llvm.linker.options = !{}
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

; !0-!2: module flags; !3: producer identification string.
!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{!"clang version 19.0.0git (/scratch/llvm-aie/clang 640962db16e997d4aaf9dadcf09d9a4fc7e06fe4)"}
; !4: TBAA access tag attached to the i32 load from %params in the preheader
; (base type !5, access type !6 "int", offset 0).
!4 = !{!5, !6, i64 0}
!5 = !{!"TanhTemplated", !6, i64 0, !7, i64 4, !7, i64 5, !7, i64 6, !7, i64 7, !7, i64 8, !7, i64 9, !9, i64 10, !6, i64 12, !6, i64 16, !6, i64 20, !6, i64 24, !6, i64 28, !10, i64 32}
!6 = !{!"int", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C++ TBAA"}
!9 = !{!"short", !7, i64 0}
!10 = !{!"_ZTS23tanh_templated_params_tIu6__bf16E"}
; !11: TBAA access tag attached to the <16 x bfloat> loads and stores in
; for.body (both base and access type are !7 "omnipotent char").
!11 = !{!7, !7, i64 0}
; !12: loop metadata attached to the back-edge branch of for.body.
; !14 carries "llvm.loop.itercount.range" with a single i64 4 operand --
; presumably a minimum iteration count of 4; confirm against the AIE backend.
!12 = distinct !{!12, !13, !14}
!13 = !{!"llvm.loop.mustprogress"}
!14 = !{!"llvm.loop.itercount.range", i64 4}