llvm
diff --git a/Diff for: ‎llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+3-3 b/Diff for: ‎llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+3-3
diff --git a/Diff for: ‎llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+264-100 b/Diff for: ‎llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+264-100
diff --git a/Diff for: ‎llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+4-4 b/Diff for: ‎llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+4-4
diff --git a/Diff for: ‎llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+231-108 b/Diff for: ‎llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+231-108
diff --git a/Diff for: ‎llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+159 b/Diff for: ‎llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+159
diff --git a/Diff for: ‎llvm/test/CodeGen/AMDGPU/memory_clause.ll
+10-13 b/Diff for: ‎llvm/test/CodeGen/AMDGPU/memory_clause.ll
+10-13
diff --git a/Diff for: ‎llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll
+18 b/Diff for: ‎llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll
+18
diff --git a/Diff for: ‎llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+7-7 b/Diff for: ‎llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+7-7
@@ -1092,7 +1092,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // is transformed to:
   //
   //   addr2 = gep float, float* p, i64 a ; inbounds removed
-  //   addr  = gep inbounds float, float* addr2, i64 5
+  //   addr  = gep float, float* addr2, i64 5 ; inbounds removed
   //
   // If a is -4, although the old index b is in bounds, the new index a is
   // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
@@ -1103,7 +1103,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // TODO(jingyue): do some range analysis to keep as many inbounds as
   // possible. GEPs with inbounds are more friendly to alias analysis.
   // TODO(gep_nowrap): Preserve nuw at least.
-  bool GEPWasInBounds = GEP->isInBounds();
+  GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none();
   GEP->setNoWrapFlags(GEPNoWrapFlags::none());
 
   // Lowers a GEP to either GEPs with a single index or arithmetic operations.
@@ -1153,7 +1153,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   IRBuilder<> Builder(GEP);
   NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
       NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
-      GEP->getName(), GEPWasInBounds));
+      GEP->getName(), NewGEPFlags));
   NewGEP->copyMetadata(*GEP);
 
   GEP->replaceAllUsesWith(NewGEP);
 
@@ -238,8 +238,8 @@ main_body:
   %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
   %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
-  %28 = or disjoint i32 %27, 3
-  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
+  %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27, !amdgpu.uniform !0
+  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3, !amdgpu.uniform !0
   %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
   %32 = extractelement <4 x float> %31, i32 0
@@ -270,8 +270,8 @@ main_body:
   %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
   %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
-  %28 = or disjoint i32 %27, 3
-  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
+  %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27
+  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3
   %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
   %32 = extractelement <4 x float> %31, i32 0
 
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+; This test checks memory addresses with constant offset components that should
+; not be folded into memory accesses with immediate offsets.
+; SeparateConstOffsetFromGEP transforms the GEPs in a way that can lead to
+; out-of-bounds or negative intermediate results in the address computation,
+; which are problematic for flat and scratch instructions:
+;     gep[inbounds](p, i + 3) -> gep(gep(p, i), 3)
+
+
+; FIXME: The offset here should not be folded: if %p points to the beginning of
+; scratch or LDS and %i is -1, a folded offset crashes the program.
+define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
+; GFX90A-LABEL: flat_offset_maybe_oob:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    flat_load_dword v0, v[0:1] offset:12
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_offset_maybe_oob:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    flat_load_dword v0, v[0:1] offset:12
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: flat_offset_maybe_oob:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT:    flat_load_dword v0, v[0:1] offset:12
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_offset_maybe_oob:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    flat_load_b32 v0, v[0:1] offset:12
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_offset_maybe_oob:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1] offset:12
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %idx = add nsw i32 %i, 3
+  %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
+  %l = load i32, ptr %arrayidx
+  ret i32 %l
+}
+
+; For MUBUF and for GFX12, folding the offset is okay.
+define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
+; GFX90A-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX90A-MUBUF:       ; %bb.0:
+; GFX90A-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-MUBUF-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
+; GFX90A-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX90A-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob:
+; GFX90A-FLATSCR:       ; %bb.0:
+; GFX90A-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX90A-FLATSCR-NEXT:    v_add3_u32 v0, v0, v1, 12
+; GFX90A-FLATSCR-NEXT:    scratch_load_dword v0, v0, off
+; GFX90A-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX10-MUBUF:       ; %bb.0:
+; GFX10-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-MUBUF-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
+; GFX10-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX10-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLATSCR-LABEL: private_offset_maybe_oob:
+; GFX10-FLATSCR:       ; %bb.0:
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-FLATSCR-NEXT:    v_add3_u32 v0, v0, v1, 12
+; GFX10-FLATSCR-NEXT:    scratch_load_dword v0, v0, off
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: private_offset_maybe_oob:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX942-NEXT:    v_add3_u32 v0, v0, v1, 12
+; GFX942-NEXT:    scratch_load_dword v0, v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: private_offset_maybe_oob:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v0, v0, v1, 12
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_offset_maybe_oob:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:12
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %idx = add nsw i32 %i, 3
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx
+  %l = load i32, ptr addrspace(5) %arrayidx
+  ret i32 %l
+}
@@ -225,22 +225,19 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
-  %tmp3 = load <4 x i32>, ptr addrspace(5) %tmp2, align 16
-  %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
-  %tmp5 = add nuw nsw i32 %tmp, 1
-  %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp5
+  %base = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
+  %tmp3 = load <4 x i32>, ptr addrspace(5) %base, align 16
+  %base1 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
+  %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 1
   %tmp7 = load <4 x i32>, ptr addrspace(5) %tmp6, align 16
-  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp5
-  %tmp9 = add nuw nsw i32 %tmp, 2
-  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp9
+  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 1
+  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 2
   %tmp11 = load <4 x i32>, ptr addrspace(5) %tmp10, align 16
-  %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp9
-  %tmp13 = add nuw nsw i32 %tmp, 3
-  %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp13
+  %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 2
+  %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 3
   %tmp15 = load <4 x i32>, ptr addrspace(5) %tmp14, align 16
-  %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp13
-  store <4 x i32> %tmp3, ptr addrspace(5) %tmp4, align 16
+  %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 3
+  store <4 x i32> %tmp3, ptr addrspace(5) %base1, align 16
   store <4 x i32> %tmp7, ptr addrspace(5) %tmp8, align 16
   store <4 x i32> %tmp11, ptr addrspace(5) %tmp12, align 16
   store <4 x i32> %tmp15, ptr addrspace(5) %tmp16, align 16
 
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=separate-const-offset-from-gep -S | FileCheck %s
+
+; The inbounds flags cannot be preserved here: If %p points to the beginning of
+; an object and %i is 1, the intermediate GEP (%p - 4 bytes) is out of bounds.
+define ptr @maybe_oob(ptr %p, i64 %i) {
+; CHECK-LABEL: @maybe_oob(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX1:%.*]] = sub i64 0, [[I:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX1]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
+; CHECK-NEXT:    ret ptr [[ARRAYIDX2]]
+;
+entry:
+  %idx = sub nsw i64 1, %i
+  %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx
+  ret ptr %arrayidx
+}
@@ -11,9 +11,9 @@ define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocaptu
 ; IR-NEXT:    [[TMP:%.*]] = sext i32 [[Y]] to i64
 ; IR-NEXT:    [[TMP1:%.*]] = sext i32 [[X]] to i64
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
-; IR-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 4
-; IR-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 128
-; IR-NEXT:    [[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 132
+; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 4
+; IR-NEXT:    [[TMP144:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 128
+; IR-NEXT:    [[TMP187:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 132
 ; IR-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
 ; IR-NEXT:    ret void
 ;
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, pt
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
 ; IR-NEXT:    [[TMP6:%.*]] = add i32 [[Y]], 255
 ; IR-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
-; IR-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 1020
+; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 1020
 ; IR-NEXT:    [[TMP12:%.*]] = add i32 [[X]], 256
 ; IR-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
 ; IR-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
@@ -91,13 +91,13 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
 ; IR-NEXT:    [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; IR-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 1020
+; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1020
 ; IR-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
 ; IR-NEXT:    [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
-; IR-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 64512
+; IR-NEXT:    [[TMP144:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 64512
 ; IR-NEXT:    [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
 ; IR-NEXT:    [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
-; IR-NEXT:    [[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 65532
+; IR-NEXT:    [[TMP187:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
 ; IR-NEXT:    [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
 ; IR-NEXT:    [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
 ; IR-NEXT:    store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4