Skip to content

Commit 0232309

Browse files
committed
[SOL] Correctly copy 16-byte aligned memory (anza-xyz#97)
* Fix issue with copying 16-byte aligned memory
1 parent b20c727 commit 0232309

File tree

4 files changed

+142
-34
lines changed

4 files changed

+142
-34
lines changed

llvm/lib/Target/SBF/SBFInstrInfo.cpp

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ void SBFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
5252
DebugLoc dl = MI->getDebugLoc();
5353
unsigned LdOpc, StOpc;
5454

55+
unsigned BytesPerOp = std::min(static_cast<unsigned>(Alignment), 8u);
5556
switch (Alignment) {
5657
case 1:
5758
LdOpc = SBF::LDB;
@@ -66,49 +67,62 @@ void SBFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
6667
StOpc = SBF::STW;
6768
break;
6869
case 8:
70+
case 16:
6971
LdOpc = SBF::LDD;
7072
StOpc = SBF::STD;
7173
break;
7274
default:
7375
llvm_unreachable("unsupported memcpy alignment");
7476
}
7577

76-
unsigned IterationNum = CopyLen >> Log2_64(Alignment);
77-
for(unsigned I = 0; I < IterationNum; ++I) {
78+
unsigned IterationNum = (CopyLen >> Log2_64(BytesPerOp));
79+
for (unsigned I = 0; I < IterationNum; ++I) {
7880
BuildMI(*BB, MI, dl, get(LdOpc))
79-
.addReg(ScratchReg, RegState::Define).addReg(SrcReg)
80-
.addImm(I * Alignment);
81+
.addReg(ScratchReg, RegState::Define)
82+
.addReg(SrcReg)
83+
.addImm(I * BytesPerOp);
8184
BuildMI(*BB, MI, dl, get(StOpc))
82-
.addReg(ScratchReg, RegState::Kill).addReg(DstReg)
83-
.addImm(I * Alignment);
85+
.addReg(ScratchReg, RegState::Kill)
86+
.addReg(DstReg)
87+
.addImm(I * BytesPerOp);
8488
}
8589

86-
unsigned BytesLeft = CopyLen & (Alignment - 1);
87-
unsigned Offset = IterationNum * Alignment;
88-
bool Hanging4Byte = BytesLeft & 0x4;
89-
bool Hanging2Byte = BytesLeft & 0x2;
90-
bool Hanging1Byte = BytesLeft & 0x1;
91-
if (Hanging4Byte) {
92-
BuildMI(*BB, MI, dl, get(SBF::LDW))
93-
.addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
94-
BuildMI(*BB, MI, dl, get(SBF::STW))
95-
.addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
96-
Offset += 4;
90+
unsigned BytesLeft = CopyLen - IterationNum * BytesPerOp;
91+
unsigned Offset;
92+
if (BytesLeft == 0) {
93+
BB->erase(MI);
94+
return;
9795
}
98-
if (Hanging2Byte) {
99-
BuildMI(*BB, MI, dl, get(SBF::LDH))
100-
.addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
101-
BuildMI(*BB, MI, dl, get(SBF::STH))
102-
.addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
103-
Offset += 2;
104-
}
105-
if (Hanging1Byte) {
106-
BuildMI(*BB, MI, dl, get(SBF::LDB))
107-
.addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset);
108-
BuildMI(*BB, MI, dl, get(SBF::STB))
109-
.addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset);
96+
97+
if (BytesLeft < 2) {
98+
Offset = CopyLen - 1;
99+
LdOpc = SBF::LDB;
100+
StOpc = SBF::STB;
101+
} else if (BytesLeft <= 2) {
102+
Offset = CopyLen - 2;
103+
LdOpc = SBF::LDH;
104+
StOpc = SBF::STH;
105+
} else if (BytesLeft <= 4) {
106+
Offset = CopyLen - 4;
107+
LdOpc = SBF::LDW;
108+
StOpc = SBF::STW;
109+
} else if (BytesLeft <= 8) {
110+
Offset = CopyLen - 8;
111+
LdOpc = SBF::LDD;
112+
StOpc = SBF::STD;
113+
} else {
114+
llvm_unreachable("There cannot be more than 8 bytes left");
110115
}
111116

117+
BuildMI(*BB, MI, dl, get(LdOpc))
118+
.addReg(ScratchReg, RegState::Define)
119+
.addReg(SrcReg)
120+
.addImm(Offset);
121+
BuildMI(*BB, MI, dl, get(StOpc))
122+
.addReg(ScratchReg, RegState::Kill)
123+
.addReg(DstReg)
124+
.addImm(Offset);
125+
112126
BB->erase(MI);
113127
}
114128

llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ SDValue SBFSelectionDAGInfo::EmitTargetCodeForMemcpy(
2727
return SDValue();
2828

2929
unsigned CopyLen = ConstantSize->getZExtValue();
30-
unsigned StoresNumEstimate = alignTo(CopyLen, Alignment) >> Log2(Alignment);
30+
// If the alignment is greater than 8, we can only store and load 8 bytes at a
31+
// time.
32+
uint64_t BytesPerOp = std::min(Alignment.value(), static_cast<uint64_t>(8));
33+
unsigned StoresNumEstimate =
34+
alignTo(CopyLen, Alignment) >> Log2_64(BytesPerOp);
3135
// Impose the same copy length limit as MaxStoresPerMemcpy.
3236
if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
3337
return SDValue();

llvm/test/CodeGen/SBF/memcpy-expand-in-order.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,5 @@ entry:
7272
; CHECK: stxdw [[[DST_REG]] + 8], [[SCRATCH_REG]]
7373
; CHECK: ldxdw [[SCRATCH_REG]], [[[SRC_REG]] + 16]
7474
; CHECK: stxdw [[[DST_REG]] + 16], [[SCRATCH_REG]]
75-
; CHECK: ldxh [[SCRATCH_REG]], [[[SRC_REG]] + 24]
76-
; CHECK: stxh [[[DST_REG]] + 24], [[SCRATCH_REG]]
77-
; CHECK: ldxb [[SCRATCH_REG]], [[[SRC_REG]] + 26]
78-
; CHECK: stxb [[[DST_REG]] + 26], [[SCRATCH_REG]]
75+
; CHECK: ldxw [[SCRATCH_REG]], [[[SRC_REG]] + 23]
76+
; CHECK: stxw [[[DST_REG]] + 23], [[SCRATCH_REG]]

llvm/test/CodeGen/SBF/memcpy_16.ll

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
; RUN: llc < %s -march=sbf -sbf-expand-memcpy-in-order | FileCheck %s
2+
3+
; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite)
4+
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
5+
6+
define void @memcpy_test_1(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
7+
entry:
8+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 32, i1 0)
9+
10+
; 4 pairs of loads and stores
11+
; CHECK: memcpy_test_1
12+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0]
13+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]]
14+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8]
15+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]]
16+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16]
17+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]]
18+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 24]
19+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 24], [[SCRATCH_REG:r[0-9]]]
20+
ret void
21+
}
22+
23+
define void @memcpy_test_2(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
24+
entry:
25+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 17, i1 0)
26+
27+
; 2 pairs of loads and stores + 1 pair for the byte
28+
; CHECK: memcpy_test_2
29+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0]
30+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]]
31+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8]
32+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]]
33+
; CHECK: ldxb [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16]
34+
; CHECK: stxb [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]]
35+
ret void
36+
}
37+
38+
define void @memcpy_test_3(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
39+
entry:
40+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 18, i1 0)
41+
42+
; 2 pairs of loads and stores + 1 pair for the 2 bytes
43+
; CHECK: memcpy_test_3
44+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0]
45+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]]
46+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8]
47+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]]
48+
; CHECK: ldxh [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16]
49+
; CHECK: stxh [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]]
50+
ret void
51+
}
52+
53+
define void @memcpy_test_4(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
54+
entry:
55+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 19, i1 0)
56+
57+
; 2 pairs of loads and stores + 1 pair for the 3 bytes
58+
; CHECK: memcpy_test_4
59+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0]
60+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]]
61+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8]
62+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]]
63+
; CHECK: ldxw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 15]
64+
; CHECK: stxw [[[DST_REG:r[0-9]]] + 15], [[SCRATCH_REG:r[0-9]]]
65+
ret void
66+
}
67+
68+
define void @memcpy_test_5(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
69+
entry:
70+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 21, i1 0)
71+
72+
; 2 pairs of loads and stores + 1 pair for the 5 bytes
73+
; CHECK: memcpy_test_5
74+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0]
75+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]]
76+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8]
77+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]]
78+
; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 13]
79+
; CHECK: stxdw [[[DST_REG:r[0-9]]] + 13], [[SCRATCH_REG:r[0-9]]]
80+
ret void
81+
}
82+
83+
define void @memcpy_test_6(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 {
84+
entry:
85+
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 33, i1 0)
86+
87+
; More than 32 bytes, call memcpy
88+
; CHECK: memcpy_test_6
89+
; CHECK: mov64 r3, 33
90+
; CHECK: call memcpy
91+
ret void
92+
}

0 commit comments

Comments (0)