Skip to content

Commit 506aa59

Browse files
committed
x86_64: align loop headers to 64 bytes
1 parent 3fa545b commit 506aa59

File tree

7 files changed

+50
-8
lines changed

7 files changed

+50
-8
lines changed

cranelift/codegen/src/context.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ impl Context {
300300
/// Compute the loop analysis.
301301
pub fn compute_loop_analysis(&mut self) {
302302
self.loop_analysis
303-
.compute(&self.func, &self.cfg, &self.domtree)
303+
.compute(&mut self.func, &self.cfg, &self.domtree)
304304
}
305305

306306
/// Compute the control flow graph and dominator tree.

cranelift/codegen/src/ir/layout.rs

+11
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,16 @@ impl Layout {
482482
pub fn is_cold(&self, block: Block) -> bool {
483483
self.blocks[block].cold
484484
}
485+
486+
/// Mark the given block as a loop header, i.e. the start of a loop.
487+
pub fn set_loop_header(&mut self, block: Block) {
488+
self.blocks[block].loop_header = true;
489+
}
490+
491+
/// Is the given block known to be a loop header?
492+
pub fn is_loop_header(&self, block: Block) -> bool {
493+
self.blocks[block].loop_header
494+
}
485495
}
486496

487497
/// A single node in the linked-list of blocks.
@@ -494,6 +504,7 @@ struct BlockNode {
494504
last_inst: PackedOption<Inst>,
495505
seq: SequenceNumber,
496506
cold: bool,
507+
loop_header: bool,
497508
}
498509

499510
/// Iterate over blocks in layout order. See [crate::ir::layout::Layout::blocks].

cranelift/codegen/src/isa/x64/inst/mod.rs

+11
Original file line numberDiff line numberDiff line change
@@ -2211,6 +2211,17 @@ impl MachInst for Inst {
22112211
Inst::nop(std::cmp::min(preferred_size, 15) as u8)
22122212
}
22132213

2214+
fn align_basic_block(offset: CodeOffset, loop_header: bool) -> CodeOffset {
2215+
if loop_header {
2216+
// Unaligned loop headers can cause severe performance problems.
2217+
// See https://github.com/bytecodealliance/wasmtime/issues/4883.
2218+
// Here we conservatively use a 64-byte alignment.
2219+
align_to(offset, 64)
2220+
} else {
2221+
offset
2222+
}
2223+
}
2224+
22142225
fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> {
22152226
match ty {
22162227
types::I8 => Ok((&[RegClass::Int], &[types::I8])),

cranelift/codegen/src/loop_analysis.rs

+7-5
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,12 @@ impl LoopAnalysis {
100100

101101
impl LoopAnalysis {
102102
/// Detects the loops in a function. Needs the control flow graph and the dominator tree.
103-
pub fn compute(&mut self, func: &Function, cfg: &ControlFlowGraph, domtree: &DominatorTree) {
103+
pub fn compute(&mut self, func: &mut Function, cfg: &ControlFlowGraph, domtree: &DominatorTree) {
104104
let _tt = timing::loop_analysis();
105105
self.loops.clear();
106106
self.block_loop_map.clear();
107107
self.block_loop_map.resize(func.dfg.num_blocks());
108-
self.find_loop_headers(cfg, domtree, &func.layout);
108+
self.find_loop_headers(cfg, domtree, &mut func.layout);
109109
self.discover_loop_blocks(cfg, domtree, &func.layout);
110110
self.valid = true;
111111
}
@@ -134,7 +134,7 @@ impl LoopAnalysis {
134134
&mut self,
135135
cfg: &ControlFlowGraph,
136136
domtree: &DominatorTree,
137-
layout: &Layout,
137+
layout: &mut Layout,
138138
) {
139139
// We traverse the CFG in reverse postorder
140140
for &block in domtree.cfg_postorder().iter().rev() {
@@ -147,6 +147,8 @@ impl LoopAnalysis {
147147
// This block is a loop header, so we create its associated loop
148148
let lp = self.loops.push(LoopData::new(block, None));
149149
self.block_loop_map[block] = lp.into();
150+
// We also need to mark this block as a loop header in the layout.
151+
layout.set_loop_header(block);
150152
break;
151153
// We break because we only need one back edge to identify a loop header.
152154
}
@@ -270,7 +272,7 @@ mod tests {
270272
let mut domtree = DominatorTree::new();
271273
cfg.compute(&func);
272274
domtree.compute(&func, &cfg);
273-
loop_analysis.compute(&func, &cfg, &domtree);
275+
loop_analysis.compute(&mut func, &cfg, &domtree);
274276

275277
let loops = loop_analysis.loops().collect::<Vec<Loop>>();
276278
assert_eq!(loops.len(), 2);
@@ -329,7 +331,7 @@ mod tests {
329331
let mut domtree = DominatorTree::new();
330332
cfg.compute(&func);
331333
domtree.compute(&func, &cfg);
332-
loop_analysis.compute(&func, &cfg, &domtree);
334+
loop_analysis.compute(&mut func, &cfg, &domtree);
333335

334336
let loops = loop_analysis.loops().collect::<Vec<Loop>>();
335337
assert_eq!(loops.len(), 3);

cranelift/codegen/src/machinst/blockorder.rs

+17
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ pub struct BlockLoweringOrder {
106106
/// which is used by VCode emission to sink the blocks at the last
107107
/// moment (when we actually emit bytes into the MachBuffer).
108108
cold_blocks: FxHashSet<BlockIndex>,
109+
/// Lowered blocks that are loop headers; used to align them at emission.
110+
loop_headers: FxHashSet<BlockIndex>,
109111
/// Lowered blocks that are indirect branch targets.
110112
indirect_branch_targets: FxHashSet<BlockIndex>,
111113
}
@@ -438,6 +440,7 @@ impl BlockLoweringOrder {
438440
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
439441
let mut lowered_order = vec![];
440442
let mut cold_blocks = FxHashSet::default();
443+
let mut loop_headers = FxHashSet::default();
441444
let mut lowered_succ_ranges = vec![];
442445
let mut lb_to_bindex = FxHashMap::default();
443446
let mut indirect_branch_targets = FxHashSet::default();
@@ -455,6 +458,10 @@ impl BlockLoweringOrder {
455458
cold_blocks.insert(index);
456459
}
457460

461+
if f.layout.is_loop_header(block) {
462+
loop_headers.insert(index);
463+
}
464+
458465
if indirect_branch_target_clif_blocks.contains(&block) {
459466
indirect_branch_targets.insert(index);
460467
}
@@ -464,6 +471,10 @@ impl BlockLoweringOrder {
464471
cold_blocks.insert(index);
465472
}
466473

474+
if f.layout.is_loop_header(pred) || f.layout.is_loop_header(succ) {
475+
loop_headers.insert(index);
476+
}
477+
467478
if indirect_branch_target_clif_blocks.contains(&succ) {
468479
indirect_branch_targets.insert(index);
469480
}
@@ -491,6 +502,7 @@ impl BlockLoweringOrder {
491502
lowered_succ_ranges,
492503
orig_map,
493504
cold_blocks,
505+
loop_headers,
494506
indirect_branch_targets,
495507
};
496508
trace!("BlockLoweringOrder: {:?}", result);
@@ -513,6 +525,11 @@ impl BlockLoweringOrder {
513525
self.cold_blocks.contains(&block)
514526
}
515527

528+
/// Determine whether the given lowered-block index is a loop header.
529+
pub fn is_loop_header(&self, block: BlockIndex) -> bool {
530+
self.loop_headers.contains(&block)
531+
}
532+
516533
/// Determine whether the given lowered block index is an indirect branch
517534
/// target.
518535
pub fn is_indirect_branch_target(&self, block: BlockIndex) -> bool {

cranelift/codegen/src/machinst/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ pub trait MachInst: Clone + Debug {
157157

158158
/// Align a basic block offset (from start of function). By default, no
159159
/// alignment occurs.
160-
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
160+
fn align_basic_block(offset: CodeOffset, _loop_header: bool) -> CodeOffset {
161161
offset
162162
}
163163

cranelift/codegen/src/machinst/vcode.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,8 @@ impl<I: VCodeInst> VCode<I> {
849849

850850
for (block_order_idx, &block) in final_order.iter().enumerate() {
851851
trace!("emitting block {:?}", block);
852-
let new_offset = I::align_basic_block(buffer.cur_offset());
852+
let new_offset =
853+
I::align_basic_block(buffer.cur_offset(), self.block_order.is_loop_header(block));
853854
while new_offset > buffer.cur_offset() {
854855
// Pad with NOPs up to the aligned block offset.
855856
let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);

0 commit comments

Comments
 (0)