Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x86_64: align loop headers to 64 bytes #5004

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cranelift/codegen/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ impl Context {
/// Compute the loop analysis.
pub fn compute_loop_analysis(&mut self) {
self.loop_analysis
.compute(&self.func, &self.cfg, &self.domtree)
.compute(&mut self.func, &self.cfg, &self.domtree)
}

/// Compute the control flow graph and dominator tree.
Expand Down
11 changes: 11 additions & 0 deletions cranelift/codegen/src/ir/layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,16 @@ impl Layout {
pub fn is_cold(&self, block: Block) -> bool {
self.blocks[block].cold
}

/// This block is the start of a loop.
pub fn set_loop_header(&mut self, block: Block) {
self.blocks[block].loop_header = true;
}

/// Is the given block known to be a loop header?
pub fn is_loop_header(&self, block: Block) -> bool {
self.blocks[block].loop_header
}
}

/// A single node in the linked-list of blocks.
Expand All @@ -494,6 +504,7 @@ struct BlockNode {
last_inst: PackedOption<Inst>,
seq: SequenceNumber,
cold: bool,
loop_header: bool,
}

/// Iterate over blocks in layout order. See [crate::ir::layout::Layout::blocks].
Expand Down
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2211,6 +2211,17 @@ impl MachInst for Inst {
Inst::nop(std::cmp::min(preferred_size, 15) as u8)
}

fn align_basic_block(offset: CodeOffset, loop_header: bool) -> CodeOffset {
if loop_header {
// Unaligned loop headers can cause severe performance problems.
// See https://github.com/bytecodealliance/wasmtime/issues/4883.
// Here we use conservative 64 bytes alignment.
align_to(offset, 64)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you disable this when optimizing for size? Also won't this cause a large blowup when there are a lot of tiny loops in a function with multiple being able to fit in a single cacheline?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also maybe disable it for cold blocks?

} else {
offset
}
}

fn rc_for_type(ty: Type) -> CodegenResult<(&'static [RegClass], &'static [Type])> {
match ty {
types::I8 => Ok((&[RegClass::Int], &[types::I8])),
Expand Down
17 changes: 12 additions & 5 deletions cranelift/codegen/src/loop_analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,17 @@ impl LoopAnalysis {

impl LoopAnalysis {
/// Detects the loops in a function. Needs the control flow graph and the dominator tree.
pub fn compute(&mut self, func: &Function, cfg: &ControlFlowGraph, domtree: &DominatorTree) {
pub fn compute(
&mut self,
func: &mut Function,
cfg: &ControlFlowGraph,
domtree: &DominatorTree,
) {
let _tt = timing::loop_analysis();
self.loops.clear();
self.block_loop_map.clear();
self.block_loop_map.resize(func.dfg.num_blocks());
self.find_loop_headers(cfg, domtree, &func.layout);
self.find_loop_headers(cfg, domtree, &mut func.layout);
self.discover_loop_blocks(cfg, domtree, &func.layout);
self.valid = true;
}
Expand Down Expand Up @@ -134,7 +139,7 @@ impl LoopAnalysis {
&mut self,
cfg: &ControlFlowGraph,
domtree: &DominatorTree,
layout: &Layout,
layout: &mut Layout,
) {
// We traverse the CFG in reverse postorder
for &block in domtree.cfg_postorder().iter().rev() {
Expand All @@ -147,6 +152,8 @@ impl LoopAnalysis {
// This block is a loop header, so we create its associated loop
let lp = self.loops.push(LoopData::new(block, None));
self.block_loop_map[block] = lp.into();
// We also need to mark this block as a loop header in the layout.
layout.set_loop_header(block);
break;
// We break because we only need one back edge to identify a loop header.
}
Expand Down Expand Up @@ -270,7 +277,7 @@ mod tests {
let mut domtree = DominatorTree::new();
cfg.compute(&func);
domtree.compute(&func, &cfg);
loop_analysis.compute(&func, &cfg, &domtree);
loop_analysis.compute(&mut func, &cfg, &domtree);

let loops = loop_analysis.loops().collect::<Vec<Loop>>();
assert_eq!(loops.len(), 2);
Expand Down Expand Up @@ -329,7 +336,7 @@ mod tests {
let mut domtree = DominatorTree::new();
cfg.compute(&func);
domtree.compute(&func, &cfg);
loop_analysis.compute(&func, &cfg, &domtree);
loop_analysis.compute(&mut func, &cfg, &domtree);

let loops = loop_analysis.loops().collect::<Vec<Loop>>();
assert_eq!(loops.len(), 3);
Expand Down
17 changes: 17 additions & 0 deletions cranelift/codegen/src/machinst/blockorder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ pub struct BlockLoweringOrder {
/// which is used by VCode emission to sink the blocks at the last
/// moment (when we actually emit bytes into the MachBuffer).
cold_blocks: FxHashSet<BlockIndex>,
/// These are loop headers. Used for alignment.
loop_headers: FxHashSet<BlockIndex>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to merge cold_blocks and loop_headers into a single SecondaryMap with a bitflag as element? Or are both still too infrequently used such that it causes a large increase in memory usage?

/// Lowered blocks that are indirect branch targets.
indirect_branch_targets: FxHashSet<BlockIndex>,
}
Expand Down Expand Up @@ -438,6 +440,7 @@ impl BlockLoweringOrder {
// Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
let mut lowered_order = vec![];
let mut cold_blocks = FxHashSet::default();
let mut loop_headers = FxHashSet::default();
let mut lowered_succ_ranges = vec![];
let mut lb_to_bindex = FxHashMap::default();
let mut indirect_branch_targets = FxHashSet::default();
Expand All @@ -455,6 +458,10 @@ impl BlockLoweringOrder {
cold_blocks.insert(index);
}

if f.layout.is_loop_header(block) {
loop_headers.insert(index);
}

if indirect_branch_target_clif_blocks.contains(&block) {
indirect_branch_targets.insert(index);
}
Expand All @@ -464,6 +471,10 @@ impl BlockLoweringOrder {
cold_blocks.insert(index);
}

if f.layout.is_loop_header(pred) || f.layout.is_loop_header(succ) {
loop_headers.insert(index);
}

if indirect_branch_target_clif_blocks.contains(&succ) {
indirect_branch_targets.insert(index);
}
Expand Down Expand Up @@ -491,6 +502,7 @@ impl BlockLoweringOrder {
lowered_succ_ranges,
orig_map,
cold_blocks,
loop_headers,
indirect_branch_targets,
};
trace!("BlockLoweringOrder: {:?}", result);
Expand All @@ -513,6 +525,11 @@ impl BlockLoweringOrder {
self.cold_blocks.contains(&block)
}

/// Determine whether the given lowered-block index is a loop header.
pub fn is_loop_header(&self, block: BlockIndex) -> bool {
self.loop_headers.contains(&block)
}

/// Determine whether the given lowered block index is an indirect branch
/// target.
pub fn is_indirect_branch_target(&self, block: BlockIndex) -> bool {
Expand Down
2 changes: 1 addition & 1 deletion cranelift/codegen/src/machinst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ pub trait MachInst: Clone + Debug {

/// Align a basic block offset (from start of function). By default, no
/// alignment occurs.
fn align_basic_block(offset: CodeOffset) -> CodeOffset {
fn align_basic_block(offset: CodeOffset, _loop_header: bool) -> CodeOffset {
offset
}

Expand Down
3 changes: 2 additions & 1 deletion cranelift/codegen/src/machinst/vcode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,8 @@ impl<I: VCodeInst> VCode<I> {

for (block_order_idx, &block) in final_order.iter().enumerate() {
trace!("emitting block {:?}", block);
let new_offset = I::align_basic_block(buffer.cur_offset());
let new_offset =
I::align_basic_block(buffer.cur_offset(), self.block_order.is_loop_header(block));
while new_offset > buffer.cur_offset() {
// Pad with NOPs up to the aligned block offset.
let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize);
Expand Down