diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs
index 1003e57337a9..71d1f4bdd8d2 100644
--- a/cranelift/codegen/src/context.rs
+++ b/cranelift/codegen/src/context.rs
@@ -300,7 +300,7 @@ impl Context {
     /// Compute the loop analysis.
     pub fn compute_loop_analysis(&mut self) {
         self.loop_analysis
-            .compute(&self.func, &self.cfg, &self.domtree)
+            .compute(&mut self.func, &self.cfg, &self.domtree)
     }
 
     /// Compute the control flow graph and dominator tree.
diff --git a/cranelift/codegen/src/ir/layout.rs b/cranelift/codegen/src/ir/layout.rs
index 7162c848c513..494c534833b5 100644
--- a/cranelift/codegen/src/ir/layout.rs
+++ b/cranelift/codegen/src/ir/layout.rs
@@ -482,6 +482,16 @@ impl Layout {
     pub fn is_cold(&self, block: Block) -> bool {
         self.blocks[block].cold
     }
+
+    /// This block is the start of a loop.
+    pub fn set_loop_header(&mut self, block: Block) {
+        self.blocks[block].loop_header = true;
+    }
+
+    /// Is the given block known to be a loop header?
+    pub fn is_loop_header(&self, block: Block) -> bool {
+        self.blocks[block].loop_header
+    }
 }
 
 /// A single node in the linked-list of blocks.
@@ -494,6 +504,7 @@ struct BlockNode {
     last_inst: PackedOption<Inst>,
     seq: SequenceNumber,
     cold: bool,
+    loop_header: bool,
 }
 
 /// Iterate over blocks in layout order. See [crate::ir::layout::Layout::blocks].
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index ae92b7307ef2..c0047cb05322 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -2211,6 +2211,17 @@ impl MachInst for Inst {
         Inst::nop(std::cmp::min(preferred_size, 15) as u8)
     }
 
+    fn align_basic_block(offset: CodeOffset, loop_header: bool) -> CodeOffset {
+        if loop_header {
+            // Unaligned loop headers can cause severe performance problems.
+            // See https://github.com/bytecodealliance/wasmtime/issues/4883.
+            // Here we use a conservative 64-byte alignment.
+            align_to(offset, 64)
+        } else {
+            offset
+        }
+    }
+
     fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> {
         match ty {
             types::I8 => Ok((&[RegClass::Int], &[types::I8])),
diff --git a/cranelift/codegen/src/loop_analysis.rs b/cranelift/codegen/src/loop_analysis.rs
index 0e8715ae91dd..5cda997b952a 100644
--- a/cranelift/codegen/src/loop_analysis.rs
+++ b/cranelift/codegen/src/loop_analysis.rs
@@ -100,12 +100,17 @@ impl LoopAnalysis {
 
 impl LoopAnalysis {
     /// Detects the loops in a function. Needs the control flow graph and the dominator tree.
-    pub fn compute(&mut self, func: &Function, cfg: &ControlFlowGraph, domtree: &DominatorTree) {
+    pub fn compute(
+        &mut self,
+        func: &mut Function,
+        cfg: &ControlFlowGraph,
+        domtree: &DominatorTree,
+    ) {
         let _tt = timing::loop_analysis();
         self.loops.clear();
         self.block_loop_map.clear();
        self.block_loop_map.resize(func.dfg.num_blocks());
-        self.find_loop_headers(cfg, domtree, &func.layout);
+        self.find_loop_headers(cfg, domtree, &mut func.layout);
         self.discover_loop_blocks(cfg, domtree, &func.layout);
         self.valid = true;
     }
@@ -134,7 +139,7 @@ impl LoopAnalysis {
         &mut self,
         cfg: &ControlFlowGraph,
         domtree: &DominatorTree,
-        layout: &Layout,
+        layout: &mut Layout,
     ) {
         // We traverse the CFG in reverse postorder
         for &block in domtree.cfg_postorder().iter().rev() {
@@ -147,6 +152,8 @@ impl LoopAnalysis {
                     // This block is a loop header, so we create its associated loop
                     let lp = self.loops.push(LoopData::new(block, None));
                     self.block_loop_map[block] = lp.into();
+                    // We also need to mark this block as a loop header in the layout.
+                    layout.set_loop_header(block);
                     break;
                     // We break because we only need one back edge to identify a loop header.
                 }
@@ -270,7 +277,7 @@ mod tests {
        let mut domtree = DominatorTree::new();
        cfg.compute(&func);
        domtree.compute(&func, &cfg);
-        loop_analysis.compute(&func, &cfg, &domtree);
+        loop_analysis.compute(&mut func, &cfg, &domtree);
        let loops = loop_analysis.loops().collect::<Vec<Loop>>();
        assert_eq!(loops.len(), 2);
 
@@ -329,7 +336,7 @@ mod tests {
        let mut domtree = DominatorTree::new();
        cfg.compute(&func);
        domtree.compute(&func, &cfg);
-        loop_analysis.compute(&func, &cfg, &domtree);
+        loop_analysis.compute(&mut func, &cfg, &domtree);
        let loops = loop_analysis.loops().collect::<Vec<Loop>>();
        assert_eq!(loops.len(), 3);
 
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index 847cbf53f2ff..85f1aa659e60 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -106,6 +106,8 @@ pub struct BlockLoweringOrder {
    /// which is used by VCode emission to sink the blocks at the last
    /// moment (when we actually emit bytes into the MachBuffer).
    cold_blocks: FxHashSet<BlockIndex>,
+    /// These are loop headers. Used for alignment.
+    loop_headers: FxHashSet<BlockIndex>,
    /// Lowered blocks that are indirect branch targets.
    indirect_branch_targets: FxHashSet<BlockIndex>,
 }
@@ -438,6 +440,7 @@ impl BlockLoweringOrder {
         // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
         let mut lowered_order = vec![];
         let mut cold_blocks = FxHashSet::default();
+        let mut loop_headers = FxHashSet::default();
         let mut lowered_succ_ranges = vec![];
         let mut lb_to_bindex = FxHashMap::default();
         let mut indirect_branch_targets = FxHashSet::default();
@@ -455,6 +458,10 @@ impl BlockLoweringOrder {
                     cold_blocks.insert(index);
                 }
 
+                if f.layout.is_loop_header(block) {
+                    loop_headers.insert(index);
+                }
+
                 if indirect_branch_target_clif_blocks.contains(&block) {
                     indirect_branch_targets.insert(index);
                 }
@@ -464,6 +471,10 @@ impl BlockLoweringOrder {
                     cold_blocks.insert(index);
                 }
 
+                if f.layout.is_loop_header(pred) || f.layout.is_loop_header(succ) {
+                    loop_headers.insert(index);
+                }
+
                 if indirect_branch_target_clif_blocks.contains(&succ) {
                     indirect_branch_targets.insert(index);
                 }
@@ -491,6 +502,7 @@ impl BlockLoweringOrder {
             lowered_succ_ranges,
             orig_map,
             cold_blocks,
+            loop_headers,
             indirect_branch_targets,
         };
         trace!("BlockLoweringOrder: {:?}", result);
@@ -513,6 +525,11 @@ impl BlockLoweringOrder {
         self.cold_blocks.contains(&block)
     }
 
+    /// Determine whether the given lowered-block index is a loop header.
+    pub fn is_loop_header(&self, block: BlockIndex) -> bool {
+        self.loop_headers.contains(&block)
+    }
+
     /// Determine whether the given lowered block index is an indirect branch
     /// target.
     pub fn is_indirect_branch_target(&self, block: BlockIndex) -> bool {
diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs
index 36e96c708d0d..23e25ca79d15 100644
--- a/cranelift/codegen/src/machinst/mod.rs
+++ b/cranelift/codegen/src/machinst/mod.rs
@@ -157,7 +157,7 @@ pub trait MachInst: Clone + Debug {
 
     /// Align a basic block offset (from start of function). By default, no
     /// alignment occurs.
-    fn align_basic_block(offset: CodeOffset) -> CodeOffset {
+    fn align_basic_block(offset: CodeOffset, _loop_header: bool) -> CodeOffset {
         offset
     }
 
diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs
index 7f36389cd976..6bc6514f9ff6 100644
--- a/cranelift/codegen/src/machinst/vcode.rs
+++ b/cranelift/codegen/src/machinst/vcode.rs
@@ -849,7 +849,8 @@ impl<I: VCodeInst> VCode<I> {
         for (block_order_idx, &block) in final_order.iter().enumerate() {
             trace!("emitting block {:?}", block);
 
-            let new_offset = I::align_basic_block(buffer.cur_offset());
+            let new_offset =
+                I::align_basic_block(buffer.cur_offset(), self.block_order.is_loop_header(block));
             while new_offset > buffer.cur_offset() {
                 // Pad with NOPs up to the aligned block offset.
                 let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);