Skip to content

Commit 3f16af7

Browse files
author
Alex Turjan
committed
Target-dependent phase to improve memory accesses
This phase identifies memory loads that use a base address calculated via instructions like sh1add. It traces the index register of that calculation back through previous instructions to find increments. As a result, instead of having a separate instruction to increment the index followed by an instruction that scales it and adds it to the pointer, this pass merges the constant offset directly into the memory load. This reduces the total number of instructions, leading to smaller code size and faster execution. I also added a test that checks that offsets > 0 are generated for memory accesses inside unrolled loops.
1 parent 522e38b commit 3f16af7

8 files changed

Lines changed: 424 additions & 4 deletions

File tree

gcc/config.gcc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -554,8 +554,11 @@ pru-*-*)
554554
;;
555555
riscv*)
556556
cpu_type=riscv
557-
extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o"
558-
extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o riscv-avlprop.o"
557+
extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o"
558+
extra_objs="${extra_objs} riscv-shorten-memrefs.o riscv-selftests.o"
559+
extra_objs="${extra_objs} riscv-string.o"
560+
extra_objs="${extra_objs} riscv-fold-mem.o riscv-v.o riscv-vsetvl.o"
561+
extra_objs="${extra_objs} riscv-vector-costs.o riscv-avlprop.o"
559562
extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o sifive-vector-builtins-bases.o"
560563
extra_objs="${extra_objs} thead.o riscv-target-attr.o riscv-zicfilp.o riscv-apex-lto.o riscv-arcv.o"
561564
d_target_objs="riscv-d.o"

gcc/config/riscv/riscv-fold-mem.cc

Lines changed: 356 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
/* RISC-V memory folding pass
2+
* The phase is designed to exploit the zba extensions by targeting
3+
* address generation patterns where an index is being incremented
4+
* and then used to access an array. The phase folds constants into
5+
* memory accesses: instead of performing a separate add and shift
6+
* or shiftadd followed by a load it folds the cumulative increment
7+
* directly into the offset field of the load/store instruction.
8+
* As a result it replaces two or more instructions (an address
9+
* calculation and a memory access) with a single instruction which
10+
* reduces both code size and execution time. */
11+
12+
#include "config.h"
13+
#include "system.h"
14+
#include "coretypes.h"
15+
#include "backend.h"
16+
#include "rtl.h"
17+
#include "df.h"
18+
#include "options.h"
19+
#include "insn-config.h"
20+
#include "recog.h"
21+
#include "target.h"
22+
#include "tree-pass.h"
23+
#include "memmodel.h"
24+
#include "emit-rtl.h"
25+
#include "rtl-iter.h"
26+
#include <vector>
27+
28+
namespace
29+
{
30+
31+
/* Configuration data: static description of the pass handed to the
   rtl_opt_pass constructor.  The initializers are positional; the
   trailing comments name the pass_data field each one fills.  */
32+
const pass_data pass_data_riscv_fold_mem = {
33+
RTL_PASS, /* type. */
34+
"riscv_fold_mem", /* name. */
35+
OPTGROUP_LOOP, /* optinfo_flags. */
36+
TV_NONE, /* tv_id. */
37+
0, /* properties_required. */
38+
0, /* properties_provided. */
39+
0, /* properties_destroyed. */
40+
0, /* todo_flags_start. */
41+
0, /* todo_flags_finish. */
42+
};
43+
44+
class pass_riscv_fold_mem : public rtl_opt_pass
45+
{
46+
public:
47+
pass_riscv_fold_mem (gcc::context *ctxt)
48+
: rtl_opt_pass (pass_data_riscv_fold_mem, ctxt)
49+
{
50+
}
51+
52+
/* run if zba extension and 64 bit mode are enabled. */
53+
virtual bool
54+
gate (function *) final override
55+
{
56+
return optimize > 0 && TARGET_64BIT && riscv_fold_mem;
57+
}
58+
59+
virtual unsigned int execute (function *) override;
60+
61+
private:
62+
rtx_insn *find_def (rtx_insn *insn, rtx reg);
63+
rtx_insn *find_nearest_def_in_bb (rtx_insn *insn, rtx reg);
64+
rtx trace_increments (rtx_insn *insn, rtx reg, HOST_WIDE_INT *total_offset);
65+
};
66+
67+
/* Find the unique definition of REG used in INSN. */
68+
rtx_insn *
69+
pass_riscv_fold_mem::find_def (rtx_insn *insn, rtx reg)
70+
{
71+
if (!reg || !REG_P (reg))
72+
return NULL;
73+
74+
df_ref use = df_find_use (insn, reg);
75+
if (!use)
76+
return NULL;
77+
78+
struct df_link *defs = DF_REF_CHAIN (use);
79+
if (!defs || defs->next)
80+
return NULL;
81+
82+
return DF_REF_INSN (defs->ref);
83+
}
84+
85+
/* Locate the definition of REG that most closely precedes INSN
86+
within the same basic block. */
87+
rtx_insn *
88+
pass_riscv_fold_mem::find_nearest_def_in_bb (rtx_insn *insn, rtx reg)
89+
{
90+
df_ref def_ref;
91+
rtx_insn *best_def = NULL;
92+
int current_luid = DF_INSN_LUID (insn);
93+
int best_luid = -1;
94+
basic_block curr_bb = BLOCK_FOR_INSN (insn);
95+
96+
for (def_ref = DF_REG_DEF_CHAIN (REGNO (reg)); def_ref;
97+
def_ref = DF_REF_NEXT_REG (def_ref))
98+
{
99+
rtx_insn *def_insn = DF_REF_INSN (def_ref);
100+
101+
if (BLOCK_FOR_INSN (def_insn) == curr_bb)
102+
{
103+
int def_luid = DF_INSN_LUID (def_insn);
104+
105+
if (def_luid < current_luid && def_luid > best_luid)
106+
{
107+
best_luid = def_luid;
108+
best_def = def_insn;
109+
}
110+
}
111+
}
112+
return best_def;
113+
}
114+
115+
/* Recursively trace register increments to find the root register
116+
* and calculate the cumulative offset. */
117+
rtx
118+
pass_riscv_fold_mem::trace_increments (rtx_insn *insn, rtx reg,
119+
HOST_WIDE_INT *total_offset)
120+
{
121+
if (!reg || !REG_P (reg))
122+
return reg;
123+
124+
df_ref use = df_find_use (insn, reg);
125+
if (!use)
126+
return reg;
127+
128+
struct df_link *defs = DF_REF_CHAIN (use);
129+
if (!defs || defs->next)
130+
return reg;
131+
132+
rtx_insn *def_insn = DF_REF_INSN (defs->ref);
133+
134+
if (BLOCK_FOR_INSN (def_insn) != BLOCK_FOR_INSN (insn))
135+
return reg;
136+
137+
rtx set = single_set (def_insn);
138+
if (!set)
139+
return reg;
140+
141+
rtx src = SET_SRC (set);
142+
/* Look for rtx insn:
143+
* (sign_extend (plus (subreg (reg)) (const_int))) */
144+
if (GET_CODE (src) == SIGN_EXTEND && GET_CODE (XEXP (src, 0)) == PLUS)
145+
{
146+
rtx plus_op = XEXP (src, 0);
147+
if (CONST_INT_P (XEXP (plus_op, 1)))
148+
{
149+
*total_offset += INTVAL (XEXP (plus_op, 1));
150+
rtx inner_reg = NULL_RTX;
151+
152+
if (REG_P (XEXP (plus_op, 0)))
153+
inner_reg = XEXP (plus_op, 0);
154+
else if (GET_CODE (XEXP (plus_op, 0)) == SUBREG)
155+
inner_reg = XEXP (XEXP (plus_op, 0), 0);
156+
157+
if (inner_reg != NULL_RTX)
158+
return trace_increments (def_insn, inner_reg, total_offset);
159+
}
160+
}
161+
return reg;
162+
}
163+
164+
/* Main entry point for memory folding pass. */
165+
unsigned int
166+
pass_riscv_fold_mem::execute (function *fun)
167+
{
168+
df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
169+
df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
170+
df_analyze ();
171+
172+
rtx_insn *insn;
173+
for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
174+
{
175+
if (!NONDEBUG_INSN_P (insn))
176+
continue;
177+
178+
rtx set = single_set (insn);
179+
if (!set)
180+
continue;
181+
182+
rtx mem = NULL_RTX;
183+
rtx src = (GET_CODE (SET_SRC (set)) == SIGN_EXTEND)
184+
? XEXP (SET_SRC (set), 0)
185+
: SET_SRC (set);
186+
rtx dest = (GET_CODE (SET_DEST (set)) == SIGN_EXTEND)
187+
? XEXP (SET_DEST (set), 0)
188+
: SET_DEST (set);
189+
190+
if (MEM_P (src))
191+
{
192+
mem = src;
193+
}
194+
else if (MEM_P (dest))
195+
{
196+
mem = dest;
197+
}
198+
/* pick only loads or stores. */
199+
if (!mem)
200+
continue;
201+
202+
rtx addr_reg = XEXP (mem, 0);
203+
if (!REG_P (addr_reg))
204+
continue;
205+
206+
df_ref addr_use = df_find_use (insn, addr_reg);
207+
if (!addr_use || !DF_REF_CHAIN (addr_use)
208+
|| DF_REF_CHAIN (addr_use)->next)
209+
continue;
210+
211+
/* looking after shiftadd type of instructions through which
212+
* prior immediate adds can be propagated. */
213+
rtx_insn *insn_addr = DF_REF_INSN (DF_REF_CHAIN (addr_use)->ref);
214+
rtx set_addr = single_set (insn_addr);
215+
if (!set_addr || GET_CODE (SET_SRC (set_addr)) != PLUS)
216+
continue;
217+
218+
rtx plus_src = SET_SRC (set_addr);
219+
rtx and_rtx = XEXP (plus_src, 0);
220+
221+
if (GET_CODE (and_rtx) != AND || GET_CODE (XEXP (and_rtx, 0)) != ASHIFT)
222+
continue;
223+
224+
rtx index_reg = XEXP (XEXP (and_rtx, 0), 0);
225+
int shift_amount = INTVAL (XEXP (XEXP (and_rtx, 0), 1));
226+
227+
HOST_WIDE_INT cumulative_inc = 0;
228+
rtx root_reg = trace_increments (insn_addr, index_reg, &cumulative_inc);
229+
230+
if (cumulative_inc == 0 || root_reg == index_reg)
231+
continue;
232+
233+
HOST_WIDE_INT scaled_offset = (unsigned HOST_WIDE_INT)cumulative_inc
234+
<< shift_amount;
235+
236+
/* prevent immediates over 12bit. */
237+
if (!SMALL_OPERAND (scaled_offset))
238+
continue;
239+
240+
bool hazard = false;
241+
242+
for (rtx_insn *curr = NEXT_INSN (insn_addr); curr && curr != insn;
243+
curr = NEXT_INSN (curr))
244+
{
245+
if (!NONDEBUG_INSN_P (curr))
246+
continue;
247+
subrtx_iterator::array_type array;
248+
FOR_EACH_SUBRTX (iter, array, PATTERN (curr), NONCONST)
249+
{
250+
const_rtx x = *iter;
251+
if (MEM_P (x))
252+
{
253+
/* If we find any other access that coflicts with the picked
254+
* store/load. */
255+
if (true_dependence (mem, GET_MODE (mem), x)
256+
|| canon_output_dependence (mem, false, x, GET_MODE (x),
257+
curr))
258+
{
259+
hazard = true;
260+
break;
261+
}
262+
}
263+
}
264+
if (hazard)
265+
break;
266+
}
267+
268+
if (hazard)
269+
continue;
270+
271+
/* update index register and the load instruction. */
272+
validate_change (insn_addr, &XEXP (XEXP (and_rtx, 0), 0), root_reg, 1);
273+
rtx new_mem_addr = plus_constant (Pmode, addr_reg, scaled_offset);
274+
validate_change (insn, &XEXP (mem, 0), new_mem_addr, 1);
275+
276+
if (apply_change_group ())
277+
{
278+
confirm_change_group ();
279+
df_insn_rescan_all ();
280+
}
281+
else
282+
cancel_changes (0);
283+
}
284+
285+
/* Clean up redundant copies and fold expressions. */
286+
basic_block bb;
287+
FOR_EACH_BB_FN (bb, fun)
288+
{
289+
rtx_insn *insn, *next;
290+
std::vector<std::pair<rtx, rtx> > shadd_exprs;
291+
292+
for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
293+
{
294+
next = NEXT_INSN (insn);
295+
if (!NONDEBUG_INSN_P (insn))
296+
continue;
297+
298+
rtx set = single_set (insn);
299+
if (!set)
300+
continue;
301+
302+
rtx src = SET_SRC (set);
303+
rtx dest = SET_DEST (set);
304+
305+
/* registers propagation. */
306+
if (REG_P (dest) && REG_P (src))
307+
{
308+
if (DF_REG_DEF_COUNT (REGNO (src)) == 1)
309+
{
310+
df_ref def_ref = DF_REG_DEF_CHAIN (REGNO (src));
311+
if (def_ref && !DF_REF_CHAIN (def_ref))
312+
{
313+
rtx_insn *def_insn = DF_REF_INSN (def_ref);
314+
rtx def_set = single_set (def_insn);
315+
if (def_set && REG_P (SET_SRC (def_set)))
316+
{
317+
rtx orig_src = SET_SRC (def_set);
318+
if (validate_change (insn, &SET_SRC (set), orig_src,
319+
0))
320+
df_insn_rescan (insn);
321+
}
322+
}
323+
}
324+
}
325+
326+
/* Redundancy elimination for shadd expressions. */
327+
if (GET_CODE (src) == PLUS && GET_CODE (XEXP (src, 0)) == AND)
328+
{
329+
rtx existing_reg = NULL_RTX;
330+
for (auto const &entry : shadd_exprs)
331+
if (rtx_equal_p (src, entry.first))
332+
{
333+
existing_reg = entry.second;
334+
break;
335+
}
336+
337+
if (existing_reg)
338+
validate_change (insn, &SET_SRC (set), existing_reg, 0);
339+
else
340+
shadd_exprs.push_back ({ src, dest });
341+
}
342+
}
343+
}
344+
345+
df_finish_pass (true);
346+
return 0;
347+
}
348+
349+
}
350+
351+
/* Create a new instance of the RISC-V memory folding pass. */
352+
rtl_opt_pass *
353+
make_pass_riscv_fold_mem (gcc::context *ctxt)
354+
{
355+
return new pass_riscv_fold_mem (ctxt);
356+
}

gcc/config/riscv/riscv-passes.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@ INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
2121
INSERT_PASS_AFTER (pass_split_all_insns, 1, pass_avlprop);
2222
INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
2323
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_landing_pad);
24-
24+
INSERT_PASS_AFTER (pass_combine, 1, pass_riscv_fold_mem);
2525
INSERT_PASS_AFTER (pass_reorder_blocks, 1, pass_cprop_hardreg);

gcc/config/riscv/riscv-protos.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
206206
rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
207207
rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
208208
rtl_opt_pass * make_pass_insert_landing_pad (gcc::context *ctxt);
209+
rtl_opt_pass *make_pass_riscv_fold_mem (gcc::context *ctxt);
209210

210211
/* Routines implemented in riscv-string.c. */
211212
extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);

0 commit comments

Comments
 (0)