Skip to content

Commit 9dd10de

Browse files
zhongjuzheIncarnation-p-lee
authored and committed
RISC-V: Fix VSETVL PASS compile-time issue
The compile-time issue was discovered in SPEC 2017 wrf: use time and -ftime-report to analyze the profile data of the SPEC 2017 wrf compilation. Before this patch (Lazy vsetvl): scheduling : 121.89 ( 15%) 0.53 ( 11%) 122.72 ( 15%) 13M ( 1%) machine dep reorg : 424.61 ( 53%) 1.84 ( 37%) 427.44 ( 53%) 5290k ( 0%) real 13m27.074s user 13m19.539s sys 0m5.180s Simple vsetvl: machine dep reorg : 0.10 ( 0%) 0.00 ( 0%) 0.11 ( 0%) 4138k ( 0%) real 6m5.780s user 6m2.396s sys 0m2.373s The machine dep reorg entry is the compile time of the VSETVL PASS (424 seconds), which accounts for 53% of the compilation time and is much more than the time spent on scheduling. After investigation, the critical path of the VSETVL pass is compute_lcm_local_properties, which is called on every iteration of phase 2 (earliest fusion) and phase 3 (global lcm). This patch optimizes the code of compute_lcm_local_properties to reduce the compilation time. After this patch: scheduling : 117.51 ( 27%) 0.21 ( 6%) 118.04 ( 27%) 13M ( 1%) machine dep reorg : 80.13 ( 18%) 0.91 ( 26%) 81.26 ( 18%) 5290k ( 0%) real 7m25.374s user 7m20.116s sys 0m3.795s The effect of this patch is very obvious: lazy VSETVL PASS goes from 424s (53%) to 80s (18%), which is less time than scheduling takes. Tested on both RV32 and RV64 with no regression. Ok for trunk? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (extract_single_source): Remove. (pre_vsetvl::compute_vsetvl_def_data): Fix compile time issue. (pre_vsetvl::compute_transparent): New function. (pre_vsetvl::compute_lcm_local_properties): Fix compile time issue.
1 parent 097ddd5 commit 9dd10de

File tree

1 file changed

+60
-124
lines changed

1 file changed

+60
-124
lines changed

gcc/config/riscv/riscv-vsetvl.cc

+60-124
Original file line numberDiff line numberDiff line change
@@ -599,14 +599,6 @@ extract_single_source (set_info *set)
599599
return first_insn;
600600
}
601601

602-
static insn_info *
603-
extract_single_source (def_info *def)
604-
{
605-
if (!def)
606-
return nullptr;
607-
return extract_single_source (dyn_cast<set_info *> (def));
608-
}
609-
610602
static bool
611603
same_equiv_note_p (set_info *set1, set_info *set2)
612604
{
@@ -2374,6 +2366,7 @@ class pre_vsetvl
23742366
}
23752367

23762368
void compute_vsetvl_def_data ();
2369+
void compute_transparent (const bb_info *);
23772370
void compute_lcm_local_properties ();
23782371

23792372
void fuse_local_vsetvl_info ();
@@ -2452,20 +2445,16 @@ pre_vsetvl::compute_vsetvl_def_data ()
24522445
{
24532446
for (unsigned i = 0; i < m_vsetvl_def_exprs.length (); i += 1)
24542447
{
2455-
const vsetvl_info &info = *m_vsetvl_def_exprs[i];
2456-
if (!info.has_nonvlmax_reg_avl ())
2457-
continue;
2458-
unsigned int regno;
2459-
sbitmap_iterator sbi;
2460-
EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0, regno,
2461-
sbi)
2462-
if (regno == REGNO (info.get_avl ()))
2463-
{
2464-
bitmap_set_bit (m_kill[bb->index ()], i);
2465-
bitmap_set_bit (def_loc[bb->index ()],
2466-
get_expr_index (m_vsetvl_def_exprs,
2467-
m_unknow_info));
2468-
}
2448+
auto *info = m_vsetvl_def_exprs[i];
2449+
if (info->has_nonvlmax_reg_avl ()
2450+
&& bitmap_bit_p (m_reg_def_loc[bb->index ()],
2451+
REGNO (info->get_avl ())))
2452+
{
2453+
bitmap_set_bit (m_kill[bb->index ()], i);
2454+
bitmap_set_bit (def_loc[bb->index ()],
2455+
get_expr_index (m_vsetvl_def_exprs,
2456+
m_unknow_info));
2457+
}
24692458
}
24702459
continue;
24712460
}
@@ -2516,6 +2505,36 @@ pre_vsetvl::compute_vsetvl_def_data ()
25162505
sbitmap_vector_free (m_kill);
25172506
}
25182507

2508+
/* Subroutine of compute_lcm_local_properties which computes the transparency of
2509+
BB. Note that the compile time is very sensitive to compute_transparent and
2510+
compute_lcm_local_properties, any change of these 2 functions should be
2511+
aware of the compile time changing of the program which has a large number of
2512+
blocks, e.g SPEC 2017 wrf.
2513+
2514+
Current compile time profile of SPEC 2017 wrf:
2515+
2516+
1. scheduling - 27%
2517+
2. machine dep reorg (VSETVL PASS) - 18%
2518+
2519+
VSETVL pass should not spend more time than scheduling in compilation. */
2520+
void
2521+
pre_vsetvl::compute_transparent (const bb_info *bb)
2522+
{
2523+
int num_exprs = m_exprs.length ();
2524+
unsigned bb_index = bb->index ();
2525+
for (int i = 0; i < num_exprs; i++)
2526+
{
2527+
auto *info = m_exprs[i];
2528+
if (info->has_nonvlmax_reg_avl ()
2529+
&& bitmap_bit_p (m_reg_def_loc[bb_index], REGNO (info->get_avl ())))
2530+
bitmap_clear_bit (m_transp[bb_index], i);
2531+
else if (info->has_vl ()
2532+
&& bitmap_bit_p (m_reg_def_loc[bb_index],
2533+
REGNO (info->get_vl ())))
2534+
bitmap_clear_bit (m_transp[bb_index], i);
2535+
}
2536+
}
2537+
25192538
/* Compute the local properties of each recorded expression.
25202539
25212540
Local properties are those that are defined by the block, irrespective of
@@ -2572,7 +2591,7 @@ pre_vsetvl::compute_lcm_local_properties ()
25722591

25732592
bitmap_vector_clear (m_avloc, last_basic_block_for_fn (cfun));
25742593
bitmap_vector_clear (m_antloc, last_basic_block_for_fn (cfun));
2575-
bitmap_vector_clear (m_transp, last_basic_block_for_fn (cfun));
2594+
bitmap_vector_ones (m_transp, last_basic_block_for_fn (cfun));
25762595

25772596
/* - If T is locally available at the end of a block, then T' must be
25782597
available at the end of the same block. Since some optimization has
@@ -2598,117 +2617,34 @@ pre_vsetvl::compute_lcm_local_properties ()
25982617

25992618
/* Compute m_transp */
26002619
if (block_info.empty_p ())
2620+
compute_transparent (bb);
2621+
else
26012622
{
2602-
bitmap_ones (m_transp[bb_index]);
2603-
for (int i = 0; i < num_exprs; i += 1)
2604-
{
2605-
const vsetvl_info &info = *m_exprs[i];
2606-
if (!info.has_nonvlmax_reg_avl () && !info.has_vl ())
2607-
continue;
2608-
2609-
if (info.has_nonvlmax_reg_avl ())
2610-
{
2611-
unsigned int regno;
2612-
sbitmap_iterator sbi;
2613-
EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0,
2614-
regno, sbi)
2615-
{
2616-
if (regno == REGNO (info.get_avl ()))
2617-
bitmap_clear_bit (m_transp[bb->index ()], i);
2618-
}
2619-
}
2620-
2621-
for (insn_info *insn : bb->real_nondebug_insns ())
2622-
{
2623-
if (info.has_nonvlmax_reg_avl ()
2624-
&& find_access (insn->defs (), REGNO (info.get_avl ())))
2625-
{
2626-
bitmap_clear_bit (m_transp[bb_index], i);
2627-
break;
2628-
}
2629-
2630-
if (info.has_vl ()
2631-
&& reg_mentioned_p (info.get_vl (), insn->rtl ()))
2632-
{
2633-
if (find_access (insn->defs (), REGNO (info.get_vl ())))
2634-
/* We can't fuse vsetvl into the blocks that modify the
2635-
VL operand since successors of such blocks will need
2636-
the value of those blocks are defining.
2637-
2638-
bb 4: def a5
2639-
/ \
2640-
bb 5:use a5 bb 6:vsetvl a5, 5
2641-
2642-
The example above shows that we can't fuse vsetvl
2643-
from bb 6 into bb 4 since the successor bb 5 is using
2644-
the value defined in bb 4. */
2645-
;
2646-
else
2647-
{
2648-
/* We can't fuse vsetvl into the blocks that use the
2649-
VL operand which has different value from the
2650-
vsetvl info.
2651-
2652-
bb 4: def a5
2653-
|
2654-
bb 5: use a5
2655-
|
2656-
bb 6: def a5
2657-
|
2658-
bb 7: use a5
2659-
2660-
The example above shows that we can't fuse vsetvl
2661-
from bb 6 into bb 5 since their value is different.
2662-
*/
2663-
resource_info resource
2664-
= full_register (REGNO (info.get_vl ()));
2665-
def_lookup dl = crtl->ssa->find_def (resource, insn);
2666-
def_info *def
2667-
= dl.matching_set_or_last_def_of_prev_group ();
2668-
insn_info *def_insn = extract_single_source (def);
2669-
if (def_insn && vsetvl_insn_p (def_insn->rtl ()))
2670-
{
2671-
vsetvl_info def_info = vsetvl_info (def_insn);
2672-
if (m_dem.compatible_p (def_info, info))
2673-
continue;
2674-
}
2675-
}
2623+
bitmap_clear (m_transp[bb_index]);
2624+
vsetvl_info &header_info = block_info.get_entry_info ();
2625+
vsetvl_info &footer_info = block_info.get_exit_info ();
26762626

2677-
bitmap_clear_bit (m_transp[bb_index], i);
2678-
break;
2679-
}
2680-
}
2681-
}
2627+
if (header_info.valid_p () && anticipated_exp_p (header_info))
2628+
bitmap_set_bit (m_antloc[bb_index],
2629+
get_expr_index (m_exprs, header_info));
26822630

2683-
continue;
2631+
if (footer_info.valid_p ())
2632+
for (int i = 0; i < num_exprs; i += 1)
2633+
{
2634+
const vsetvl_info &info = *m_exprs[i];
2635+
if (!info.valid_p ())
2636+
continue;
2637+
if (available_exp_p (footer_info, info))
2638+
bitmap_set_bit (m_avloc[bb_index], i);
2639+
}
26842640
}
26852641

2686-
vsetvl_info &header_info = block_info.get_entry_info ();
2687-
vsetvl_info &footer_info = block_info.get_exit_info ();
2688-
2689-
if (header_info.valid_p () && anticipated_exp_p (header_info))
2690-
bitmap_set_bit (m_antloc[bb_index],
2691-
get_expr_index (m_exprs, header_info));
2692-
2693-
if (footer_info.valid_p ())
2694-
for (int i = 0; i < num_exprs; i += 1)
2695-
{
2696-
const vsetvl_info &info = *m_exprs[i];
2697-
if (!info.valid_p ())
2698-
continue;
2699-
if (available_exp_p (footer_info, info))
2700-
bitmap_set_bit (m_avloc[bb_index], i);
2701-
}
2702-
}
2703-
2704-
for (const bb_info *bb : crtl->ssa->bbs ())
2705-
{
2706-
unsigned bb_index = bb->index ();
27072642
if (invalid_opt_bb_p (bb->cfg_bb ()))
27082643
{
27092644
bitmap_clear (m_antloc[bb_index]);
27102645
bitmap_clear (m_transp[bb_index]);
27112646
}
2647+
27122648
/* Compute ae_kill for each basic block using:
27132649
27142650
~(TRANSP | COMP)

0 commit comments

Comments
 (0)