From c75876cdb470ae66da1772b5987d346f863b686d Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 18 Dec 2024 17:07:18 -0800 Subject: [PATCH] pulley: Add macro instructions for function prologue/epilogue This commit adds two new instructions to Pulley to combine the operations of setting up a frame, allocating stack, and saving clobbered registers. This is all combined into a single instruction which is relatively large but is much smaller than each of these individual operations exploded out. This is a size win on `spidermonkey.cwasm` by about 1M and locally in a small `fib.wat` test this is also a good speedup by reducing the number of instructions executed. --- cranelift/bitset/src/scalar.rs | 3 + cranelift/codegen/meta/src/pulley.rs | 19 +- .../codegen/src/isa/pulley_shared/abi.rs | 401 +++++++++++------- .../codegen/src/isa/pulley_shared/inst.isle | 1 + .../src/isa/pulley_shared/inst/emit.rs | 4 +- .../src/isa/pulley_shared/lower/isle.rs | 1 + pulley/src/interp.rs | 44 ++ pulley/src/lib.rs | 10 + pulley/src/regs.rs | 26 +- 9 files changed, 358 insertions(+), 151 deletions(-) diff --git a/cranelift/bitset/src/scalar.rs b/cranelift/bitset/src/scalar.rs index 07649c297efe..6d05338adbef 100644 --- a/cranelift/bitset/src/scalar.rs +++ b/cranelift/bitset/src/scalar.rs @@ -556,14 +556,17 @@ pub trait ScalarBitSetStorage: macro_rules! impl_storage { ( $int:ty ) => { impl ScalarBitSetStorage for $int { + #[inline] fn leading_zeros(self) -> u8 { u8::try_from(self.leading_zeros()).unwrap() } + #[inline] fn trailing_zeros(self) -> u8 { u8::try_from(self.trailing_zeros()).unwrap() } + #[inline] fn count_ones(self) -> u8 { u8::try_from(self.count_ones()).unwrap() } diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs index 557c92de2eea..4d95da23fd9d 100644 --- a/cranelift/codegen/meta/src/pulley.rs +++ b/cranelift/codegen/meta/src/pulley.rs @@ -69,6 +69,10 @@ impl Inst<'_> { Operand::Binop { dst, src1, src2 } } ("dst", ty) => Operand::Writable { name, ty }, + (name, "RegSet < XReg >") => Operand::Normal { + name, + ty: "XRegSet", + }, (name, ty) => Operand::Normal { name, ty }, }) .chain(if self.name.contains("Trap") { @@ -120,10 +124,17 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> { if i > 0 { format_string.push_str(","); } + + if ty == "XRegSet" { + format_string.push_str(" {"); + format_string.push_str(name); + format_string.push_str(":?}"); + continue; + } + format_string.push_str(" {"); format_string.push_str(name); format_string.push_str("}"); - if ty.contains("Reg") { if name == "dst" { locals.push_str(&format!("let {name} = reg_name(*{name}.to_reg());\n")); @@ -176,6 +187,12 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> { let mut defs = Vec::new(); for op in inst.operands() { match op { + // `{Push,Pop}FrameSave` doesn't participate in register allocation. + Operand::Normal { + name: _, + ty: "XRegSet", + } if *name == "PushFrameSave" || *name == "PopFrameSave" => {} + Operand::Normal { name, ty } => { if ty.contains("Reg") { uses.push(name); diff --git a/cranelift/codegen/src/isa/pulley_shared/abi.rs b/cranelift/codegen/src/isa/pulley_shared/abi.rs index 43a9ce7b789f..9f7895be36b9 100644 --- a/cranelift/codegen/src/isa/pulley_shared/abi.rs +++ b/cranelift/codegen/src/isa/pulley_shared/abi.rs @@ -10,6 +10,7 @@ use crate::{ }; use alloc::{boxed::Box, vec::Vec}; use core::marker::PhantomData; +use cranelift_bitset::ScalarBitSet; use regalloc2::{MachineEnv, PReg, PRegSet}; use smallvec::{smallvec, SmallVec}; use std::borrow::ToOwned; @@ -288,26 +289,56 @@ where smallvec![inst.into()] } + /// Generates the entire prologue for the function. + /// + /// Note that this is different from other backends where it's not spread + /// out among a few individual functions. That's because the goal here is to + /// generate a single macro-instruction for the entire prologue in the most + /// common cases and we don't want to spread the logic over multiple + /// functions. + /// + /// The general machinst methods are split to accomodate stack checks and + /// things like stack probes, all of which are empty on Pulley because + /// Pulley has its own stack check mechanism. fn gen_prologue_frame_setup( _call_conv: isa::CallConv, - flags: &settings::Flags, + _flags: &settings::Flags, _isa_flags: &PulleyFlags, frame_layout: &FrameLayout, ) -> SmallInstVec { let mut insts = SmallVec::new(); - if frame_layout.setup_area_size > 0 { - insts.push(RawInst::PushFrame.into()); - if flags.unwind_info() { - insts.push( - Inst::Unwind { - inst: UnwindInst::PushFrameRegs { - offset_upward_to_caller_sp: frame_layout.setup_area_size, - }, - } - .into(), - ); - } + let incoming_args_diff = frame_layout.tail_args_size - frame_layout.incoming_args_size; + if incoming_args_diff > 0 { + // Decrement SP by the amount of additional incoming argument space + // we need + insts.extend(Self::gen_sp_reg_adjust(-(incoming_args_diff as i32))); + } + + let style = frame_layout.pulley_frame_style(); + + match &style { + FrameStyle::None => {} + FrameStyle::PulleySetupNoClobbers => insts.push(RawInst::PushFrame.into()), + FrameStyle::PulleySetupAndSaveClobbers { + frame_size, + saved_by_pulley, + } => insts.push( + RawInst::PushFrameSave { + amt: *frame_size, + regs: pulley_interpreter::RegSet::from_bitset(*saved_by_pulley), + } + .into(), + ), + FrameStyle::Manual { frame_size } => insts.extend(Self::gen_sp_reg_adjust( + -i32::try_from(*frame_size).unwrap(), + )), + } + + for (offset, ty, reg) in frame_layout.manually_managed_clobbers(&style) { + insts.push( + Inst::gen_store(Amode::SpOffset { offset }, reg, ty, MemFlags::trusted()).into(), + ); } insts @@ -318,31 +349,71 @@ where _call_conv: isa::CallConv, _flags: &settings::Flags, _isa_flags: &PulleyFlags, + _frame_layout: &FrameLayout, + ) -> SmallInstVec { + // Note that this is intentionally empty as `gen_return` does + // everything. + SmallVec::new() + } + + fn gen_return( + _call_conv: isa::CallConv, + _isa_flags: &PulleyFlags, frame_layout: &FrameLayout, ) -> SmallInstVec { let mut insts = SmallVec::new(); - if frame_layout.setup_area_size > 0 { - insts.push(RawInst::PopFrame.into()); + let style = frame_layout.pulley_frame_style(); + + // Restore clobbered registers that are manually managed in Cranelift. + for (offset, ty, reg) in frame_layout.manually_managed_clobbers(&style) { + insts.push( + Inst::gen_load( + Writable::from_reg(reg), + Amode::SpOffset { offset }, + ty, + MemFlags::trusted(), + ) + .into(), + ); + } + + // Perform the inverse of `gen_prologue_frame_setup`. + match &style { + FrameStyle::None => {} + FrameStyle::PulleySetupNoClobbers => insts.push(RawInst::PopFrame.into()), + FrameStyle::PulleySetupAndSaveClobbers { + frame_size, + saved_by_pulley, + } => insts.push( + RawInst::PopFrameSave { + amt: *frame_size, + regs: pulley_interpreter::RegSet::from_bitset(*saved_by_pulley), + } + .into(), + ), + FrameStyle::Manual { frame_size } => { + insts.extend(Self::gen_sp_reg_adjust(i32::try_from(*frame_size).unwrap())) + } } + // Handle final stack adjustments for the tail-call ABI. if frame_layout.tail_args_size > 0 { insts.extend(Self::gen_sp_reg_adjust( frame_layout.tail_args_size.try_into().unwrap(), )); } + // And finally, return. + // + // FIXME: if `frame_layout.tail_args_size` is zero this instruction + // should get folded into the macro-instructions above. No need to have + // all functions do `pop_frame; ret`, that could be `pop_frame_and_ret`. + // Should benchmark whether this is worth it though. + insts.push(RawInst::Ret {}.into()); insts } - fn gen_return( - _call_conv: isa::CallConv, - _isa_flags: &PulleyFlags, - _frame_layout: &FrameLayout, - ) -> SmallInstVec { - smallvec![RawInst::Ret {}.into()] - } - fn gen_probestack(_insts: &mut SmallInstVec, _frame_size: u32) { // Pulley doesn't implement stack probes since all stack pointer // decrements are checked already. @@ -350,137 +421,21 @@ where fn gen_clobber_save( _call_conv: isa::CallConv, - flags: &settings::Flags, - frame_layout: &FrameLayout, + _flags: &settings::Flags, + _frame_layout: &FrameLayout, ) -> SmallVec<[Self::I; 16]> { - let mut insts = SmallVec::new(); - let setup_frame = frame_layout.setup_area_size > 0; - - let incoming_args_diff = frame_layout.tail_args_size - frame_layout.incoming_args_size; - if incoming_args_diff > 0 { - // Pulley does not generate/probestack/stack checks/etc and doesn't - // expose the direct ability to modify fp/lr, so simulate a pop, - // perform the sp adjustment, then perform the same push that was - // done previously in the prologue. - // - // Note that for now this'll generate `push_frame pop_frame` pairs - // in the prologue which isn't great, and updating that is left for - // a future refactoring to only do a `push_frame` once (e.g. skip - // the one above if this block is going to be executed) - if setup_frame { - insts.push(RawInst::PopFrame.into()); - } - // Decrement SP by the amount of additional incoming argument space - // we need - insts.extend(Self::gen_sp_reg_adjust(-(incoming_args_diff as i32))); - - if setup_frame { - insts.push(RawInst::PushFrame.into()); - } - } - - if flags.unwind_info() && setup_frame { - // The *unwind* frame (but not the actual frame) starts at the - // clobbers, just below the saved FP/LR pair. - insts.push( - Inst::Unwind { - inst: UnwindInst::DefineNewFrame { - offset_downward_to_clobbers: frame_layout.clobber_size, - offset_upward_to_caller_sp: frame_layout.setup_area_size, - }, - } - .into(), - ); - } - - // Adjust the stack pointer downward for clobbers, the function fixed - // frame (spillslots and storage slots), and outgoing arguments. - let stack_size = frame_layout.clobber_size - + frame_layout.fixed_frame_storage_size - + frame_layout.outgoing_args_size; - - // Store each clobbered register in order at offsets from SP, placing - // them above the fixed frame slots. - if stack_size > 0 { - insts.extend(Self::gen_sp_reg_adjust(-i32::try_from(stack_size).unwrap())); - - let mut cur_offset = 8; - for reg in &frame_layout.clobbered_callee_saves { - let r_reg = reg.to_reg(); - let ty = match r_reg.class() { - RegClass::Int => I64, - RegClass::Float => F64, - RegClass::Vector => unreachable!("no vector registers are callee-save"), - }; - insts.push( - Inst::gen_store( - Amode::SpOffset { - offset: i32::try_from(stack_size - cur_offset).unwrap(), - }, - Reg::from(reg.to_reg()), - ty, - MemFlags::trusted(), - ) - .into(), - ); - - if flags.unwind_info() { - insts.push( - Inst::Unwind { - inst: UnwindInst::SaveReg { - clobber_offset: frame_layout.clobber_size - cur_offset, - reg: r_reg, - }, - } - .into(), - ); - } - - cur_offset += 8 - } - } - - insts + // Note that this is intentionally empty because everything necessary + // was already done in `gen_prologue_frame_setup`. + SmallVec::new() } fn gen_clobber_restore( _call_conv: isa::CallConv, _flags: &settings::Flags, - frame_layout: &FrameLayout, + _frame_layout: &FrameLayout, ) -> SmallVec<[Self::I; 16]> { - let mut insts = SmallVec::new(); - - let stack_size = frame_layout.clobber_size - + frame_layout.fixed_frame_storage_size - + frame_layout.outgoing_args_size; - - let mut cur_offset = 8; - for reg in &frame_layout.clobbered_callee_saves { - let rreg = reg.to_reg(); - let ty = match rreg.class() { - RegClass::Int => I64, - RegClass::Float => F64, - RegClass::Vector => unreachable!("vector registers are never callee-saved"), - }; - insts.push( - Inst::gen_load( - reg.map(Reg::from), - Amode::SpOffset { - offset: i32::try_from(stack_size - cur_offset).unwrap(), - }, - ty, - MemFlags::trusted(), - ) - .into(), - ); - cur_offset += 8 - } - - if stack_size > 0 { - insts.extend(Self::gen_sp_reg_adjust(stack_size as i32)); - } - - insts + // Intentionally empty as restores happen for Pulley in `gen_return`. + SmallVec::new() } fn gen_call( @@ -606,6 +561,158 @@ where } } +/// Different styles of management of fp/lr and clobbered registers. +/// +/// This helps decide, depending on Cranelift settings and frame layout, what +/// macro instruction is used to setup the pulley frame. +enum FrameStyle { + /// No management is happening, fp/lr aren't saved by Pulley or Cranelift. + /// No stack is being allocated either. + None, + + /// No stack is being allocated and nothing is clobbered, but Pulley should + /// save the fp/lr combo. + PulleySetupNoClobbers, + + /// Pulley is managing the fp/lr combo, the stack size, and clobbered + /// X-class registers. + /// + /// Note that `saved_by_pulley` is not the exhaustive set of clobbered + /// registers. It's only those that are part of the `PushFrameSave` + /// instruction. + PulleySetupAndSaveClobbers { + /// The size of the frame, including clobbers, that's being allocated. + frame_size: u32, + /// Registers that pulley is saving/restoring. + saved_by_pulley: ScalarBitSet, + }, + + /// Cranelift is manually managing everything, both clobbers and stack + /// increments/decrements. + /// + /// Note that fp/lr are not saved in this mode. + Manual { + /// The size of the stack being allocated. + frame_size: u32, + }, +} + +/// Pulley-specific helpers when dealing with ABI code. +impl FrameLayout { + /// Whether or not this frame saves fp/lr. + fn setup_frame(&self) -> bool { + self.setup_area_size > 0 + } + + /// Returns the stack size allocated by this function, excluding incoming + /// tail args or the optional "setup area" of fp/lr. + fn stack_size(&self) -> u32 { + self.clobber_size + self.fixed_frame_storage_size + self.outgoing_args_size + } + + /// Returns the style of frame being used for this function. + /// + /// See `FrameStyle` for more information. + fn pulley_frame_style(&self) -> FrameStyle { + let saved_by_pulley = self.clobbered_xregs_saved_by_pulley(); + match ( + self.stack_size(), + self.setup_frame(), + saved_by_pulley.is_empty(), + ) { + // No stack allocated, not saving fp/lr, no clobbers, nothing to do + (0, false, true) => FrameStyle::None, + + // No stack allocated, saving fp/lr, no clobbers, so this is + // pulley-managed via push/pop_frame. + (0, true, true) => FrameStyle::PulleySetupNoClobbers, + + // Some stack is being allocated and pulley is managing fp/lr. Let + // pulley manage clobbered registers as well, regardless if they're + // present or not. + (frame_size, true, _) => FrameStyle::PulleySetupAndSaveClobbers { + frame_size, + saved_by_pulley, + }, + + // Some stack is being allocated, but pulley isn't managing fp/lr, + // so we're manually doing everything. + (frame_size, false, true) => FrameStyle::Manual { frame_size }, + + // If there's no frame setup and there's clobbered registers this + // technically should have already hit a case above, so panic here. + (_, false, false) => unreachable!(), + } + } + + /// Returns the set of clobbered registers that Pulley is managing via its + /// macro instructions rather than the generated code. + fn clobbered_xregs_saved_by_pulley(&self) -> ScalarBitSet { + let mut clobbered: ScalarBitSet = ScalarBitSet::new(); + // Pulley only manages clobbers if it's also managing fp/lr. + if !self.setup_frame() { + return clobbered; + } + let mut found_manual_clobber = false; + for reg in self.clobbered_callee_saves.iter() { + let r_reg = reg.to_reg(); + // Pulley can only manage clobbers of integer registers at this + // time, float registers are managed manually. + // + // Also assert that all pulley-managed clobbers come first, + // otherwise the loop below in `manually_managed_clobbers` is + // incorrect. + if r_reg.class() == RegClass::Int { + assert!(!found_manual_clobber); + clobbered.insert(r_reg.hw_enc()); + } else { + found_manual_clobber = true; + } + } + clobbered + } + + /// Returns an iterator over the clobbers that Cranelift is managing, not + /// Pulley. + /// + /// If this frame has clobbers then they're either saved by Pulley with + /// `FrameStyle::PulleySetupAndSaveClobbers`. Cranelift might need to manage + /// these registers depending on Cranelift settings. Cranelift also always + /// manages floating-point registers. + fn manually_managed_clobbers<'a>( + &'a self, + style: &'a FrameStyle, + ) -> impl Iterator + 'a { + let mut offset = self.stack_size(); + self.clobbered_callee_saves.iter().filter_map(move |reg| { + // Allocate space for this clobber no matter what. If pulley is + // managing this then we're just accounting for the pulley-saved + // registers as well. Note that all pulley-managed registers come + // first in the list here. + offset -= 8; + let r_reg = reg.to_reg(); + let ty = match r_reg.class() { + RegClass::Int => { + // If this register is saved by pulley, skip this clobber. + if let FrameStyle::PulleySetupAndSaveClobbers { + saved_by_pulley, .. + } = style + { + if saved_by_pulley.contains(r_reg.hw_enc()) { + return None; + } + } + I64 + } + RegClass::Float => F64, + RegClass::Vector => unreachable!("no vector registers are callee-save"), + }; + let offset = i32::try_from(offset).unwrap(); + Some((offset, ty, Reg::from(reg.to_reg()))) + }) + } +} + impl

PulleyABICallSite

where P: PulleyTargetKind, diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 1097285feacc..b30f4b9450f1 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -156,6 +156,7 @@ (type BoxCallIndInfo (primitive BoxCallIndInfo)) (type BoxReturnCallInfo (primitive BoxReturnCallInfo)) (type BoxReturnCallIndInfo (primitive BoxReturnCallIndInfo)) +(type XRegSet (primitive XRegSet)) ;;;; Address Modes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 59aebec1e49b..80be6330bc8c 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -573,7 +573,9 @@ fn pulley_emit

( Inst::Raw { raw } => { match raw { - RawInst::PushFrame | RawInst::StackAlloc32 { .. } => { + RawInst::PushFrame + | RawInst::StackAlloc32 { .. } + | RawInst::PushFrameSave { .. } => { sink.add_trap(ir::TrapCode::STACK_OVERFLOW); } _ => {} diff --git a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs index da74f73f6916..de2e0cc20c70 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs @@ -31,6 +31,7 @@ type BoxCallIndInfo = Box>; type BoxReturnCallInfo = Box>; type BoxReturnCallIndInfo = Box>; type BoxExternalName = Box; +type XRegSet = pulley_interpreter::RegSet; pub(crate) struct PulleyIsleContext<'a, 'b, I, B> where diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index fe32ea75d6ac..db33ba1ee34e 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -1798,6 +1798,50 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + #[inline] + fn push_frame_save(&mut self, amt: u32, regs: RegSet) -> ControlFlow { + // Decrement the stack pointer `amt` bytes plus 2 pointers more for + // fp/lr. + let ptr_size = size_of::(); + let full_amt = usize::try_from(amt).unwrap() + 2 * ptr_size; + let new_sp = self.state[XReg::sp].get_ptr::().wrapping_sub(full_amt); + self.set_sp::(new_sp)?; + + unsafe { + // Emulate `push_frame` by placing `lr` and `fp` onto the stack, in + // that order, at the top of the allocated area. + self.store(XReg::sp, (full_amt - 1 * ptr_size) as i32, self.state.lr); + self.store(XReg::sp, (full_amt - 2 * ptr_size) as i32, self.state.fp); + + // Set `fp` to the top of our frame, where `fp` is stored. + let mut offset = amt as i32; + self.state.fp = self.state[XReg::sp] + .get_ptr::() + .byte_offset(offset as isize); + + // Next save any registers in `regs` to the stack. + for reg in regs { + offset -= 8; + self.store(XReg::sp, offset, self.state[reg].get_u64()); + } + } + ControlFlow::Continue(()) + } + + fn pop_frame_save(&mut self, amt: u32, regs: RegSet) -> ControlFlow { + // Restore all registers in `regs`, followed by the normal `pop_frame` + // opcode below to restore fp/lr. + unsafe { + let mut offset = amt as i32; + for reg in regs { + offset -= 8; + let val = self.load(XReg::sp, offset); + self.state[reg].set_u64(val); + } + } + self.pop_frame() + } + fn pop_frame(&mut self) -> ControlFlow { self.set_sp_unchecked(self.state.fp); let fp = self.pop(); diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index f87388f2f59c..ca971f7724c2 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -442,6 +442,16 @@ macro_rules! for_each_op { /// `sp = fp; pop fp; pop lr` pop_frame = PopFrame ; + /// Macro-instruction to enter a function, allocate some stack, and + /// then save some registers. + /// + /// This is equivalent to `push_frame`, `stack_alloc32 amt`, then + /// saving all of `regs` to the top of the stack just allocated. + push_frame_save = PushFrameSave { amt: u32, regs: RegSet }; + /// Inverse of `push_frame_save`. Restores `regs` from the top of + /// the stack, then runs `stack_free32 amt`, then runs `pop_frame`. + pop_frame_save = PopFrameSave { amt: u32, regs: RegSet }; + /// `sp = sp.checked_sub(amt)` stack_alloc32 = StackAlloc32 { amt: u32 }; diff --git a/pulley/src/regs.rs b/pulley/src/regs.rs index 72e4bbf2129e..434411fb2d94 100644 --- a/pulley/src/regs.rs +++ b/pulley/src/regs.rs @@ -261,10 +261,32 @@ impl Into> for RegSet { impl IntoIterator for RegSet { type Item = R; - type IntoIter = core::iter::FilterMap, fn(u8) -> Option>; + type IntoIter = RegSetIntoIter; fn into_iter(self) -> Self::IntoIter { - self.bitset.into_iter().filter_map(R::new) + RegSetIntoIter { + iter: self.bitset.into_iter(), + _marker: PhantomData, + } + } +} + +/// Returned iterator from `RegSet::into_iter` +pub struct RegSetIntoIter { + iter: cranelift_bitset::scalar::Iter, + _marker: PhantomData, +} + +impl Iterator for RegSetIntoIter { + type Item = R; + fn next(&mut self) -> Option { + Some(R::new(self.iter.next()?).unwrap()) + } +} + +impl DoubleEndedIterator for RegSetIntoIter { + fn next_back(&mut self) -> Option { + Some(R::new(self.iter.next_back()?).unwrap()) } }