From f236dd82eefe17216c4b86accfcfca785ceecc54 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 16 Dec 2024 08:57:18 -0800 Subject: [PATCH] pulley: Implement `return_call` instructions This commit fleshes out the Cranelift lowerings of tail calls which gets the wasm tail call proposal itself working on Pulley. Most of the bits and pieces here were copied over from the riscv64 backend and then edited to suit Pulley. --- .../codegen/src/isa/pulley_shared/abi.rs | 41 +++++- .../codegen/src/isa/pulley_shared/inst.isle | 8 ++ .../src/isa/pulley_shared/inst/emit.rs | 127 ++++++++++++++++++ .../codegen/src/isa/pulley_shared/inst/mod.rs | 36 +++++ .../src/isa/pulley_shared/lower/isle.rs | 6 +- crates/wasmtime/src/config.rs | 6 +- crates/wast-util/src/lib.rs | 11 +- pulley/src/interp.rs | 7 + pulley/src/lib.rs | 4 + .../tail-call/loop-across-modules.wast | 2 +- 10 files changed, 236 insertions(+), 12 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/abi.rs b/cranelift/codegen/src/isa/pulley_shared/abi.rs index 33470dc490e0..43a9ce7b789f 100644 --- a/cranelift/codegen/src/isa/pulley_shared/abi.rs +++ b/cranelift/codegen/src/isa/pulley_shared/abi.rs @@ -611,12 +611,45 @@ where P: PulleyTargetKind, { pub fn emit_return_call( - self, - _ctx: &mut Lower>, - _args: isle::ValueSlice, + mut self, + ctx: &mut Lower>, + args: isle::ValueSlice, _backend: &PulleyBackend

, ) { - todo!() + let new_stack_arg_size = + u32::try_from(self.sig(ctx.sigs()).sized_stack_arg_space()).unwrap(); + + ctx.abi_mut().accumulate_tail_args_size(new_stack_arg_size); + + // Put all arguments in registers and stack slots (within that newly + // allocated stack space). + self.emit_args(ctx, args); + self.emit_stack_ret_arg_for_tail_call(ctx); + + let dest = self.dest().clone(); + let uses = self.take_uses(); + + match dest { + CallDest::ExtName(name, RelocDistance::Near) => { + let info = Box::new(ReturnCallInfo { + dest: name, + uses, + new_stack_arg_size, + }); + ctx.emit(Inst::ReturnCall { info }.into()); + } + CallDest::ExtName(_name, RelocDistance::Far) => { + unimplemented!("return-call of a host function") + } + CallDest::Reg(callee) => { + let info = Box::new(ReturnCallInfo { + dest: XReg::new(callee).unwrap(), + uses, + new_stack_arg_size, + }); + ctx.emit(Inst::ReturnIndirectCall { info }.into()); + } + } } } diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 89fdec3fe796..ab0d39a96a16 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -53,6 +53,12 @@ ;; An indirect call to an unknown callee. (IndirectCall (info BoxCallIndInfo)) + ;; A direct return-call macro instruction. + (ReturnCall (info BoxReturnCallInfo)) + + ;; An indirect return-call macro instruction. + (ReturnIndirectCall (info BoxReturnCallIndInfo)) + ;; An indirect call out to a host-defined function. The host function ;; pointer is the first "argument" of this function call. (IndirectCallHost (info BoxCallInfo)) @@ -125,6 +131,8 @@ (type BoxCallInfo (primitive BoxCallInfo)) (type BoxCallIndInfo (primitive BoxCallIndInfo)) +(type BoxReturnCallInfo (primitive BoxReturnCallInfo)) +(type BoxReturnCallIndInfo (primitive BoxReturnCallIndInfo)) ;;;; Address Modes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index d209e6530ebc..5b0c435e83b7 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -190,6 +190,29 @@ fn pulley_emit

( } } + Inst::ReturnCall { info } => { + emit_return_call_common_sequence(sink, emit_info, state, &info); + + // Emit an unconditional jump which is quite similar to `Inst::Call` + // except that a `jump` opcode is used instead of a `call` opcode. + sink.put1(pulley_interpreter::Opcode::Jump as u8); + sink.add_reloc(Reloc::X86CallPCRel4, &info.dest, -1); + sink.put4(0); + + // Islands were manually handled in + // `emit_return_call_common_sequence`. + *start_offset = sink.cur_offset(); + } + + Inst::ReturnIndirectCall { info } => { + emit_return_call_common_sequence(sink, emit_info, state, &info); + enc::xjump(sink, info.dest); + + // Islands were manually handled in + // `emit_return_call_common_sequence`. + *start_offset = sink.cur_offset(); + } + Inst::IndirectCallHost { info } => { // Emit a relocation to fill in the actual immediate argument here // in `call_indirect_host`. @@ -496,3 +519,107 @@ fn pulley_emit

( } } } + +fn emit_return_call_common_sequence( + sink: &mut MachBuffer>, + emit_info: &EmitInfo, + state: &mut EmitState

, + info: &ReturnCallInfo, +) where + P: PulleyTargetKind, +{ + // The return call sequence can potentially emit a lot of instructions, so + // lets emit an island here if we need it. + // + // It is difficult to calculate exactly how many instructions are going to + // be emitted, so we calculate it by emitting it into a disposable buffer, + // and then checking how many instructions were actually emitted. + let mut buffer = MachBuffer::new(); + let mut fake_emit_state = state.clone(); + + return_call_emit_impl(&mut buffer, emit_info, &mut fake_emit_state, info); + + // Finalize the buffer and get the number of bytes emitted. + let buffer = buffer.finish(&Default::default(), &mut Default::default()); + let length = buffer.data().len() as u32; + + // And now emit the island inline with this instruction. + if sink.island_needed(length) { + let jump_around_label = sink.get_label(); + >::gen_jump(jump_around_label).emit(sink, emit_info, state); + sink.emit_island(length + 4, &mut state.ctrl_plane); + sink.bind_label(jump_around_label, &mut state.ctrl_plane); + } + + // Now that we're done, emit the *actual* return sequence. + return_call_emit_impl(sink, emit_info, state, info); +} + +/// This should not be called directly, Instead prefer to call [emit_return_call_common_sequence]. +fn return_call_emit_impl( + sink: &mut MachBuffer>, + emit_info: &EmitInfo, + state: &mut EmitState

, + info: &ReturnCallInfo, +) where + P: PulleyTargetKind, +{ + let sp_to_fp_offset = { + let frame_layout = state.frame_layout(); + i64::from( + frame_layout.clobber_size + + frame_layout.fixed_frame_storage_size + + frame_layout.outgoing_args_size, + ) + }; + + // Restore all clobbered registers before leaving the function. + let mut clobber_offset = sp_to_fp_offset - 8; + for reg in state.frame_layout().clobbered_callee_saves.clone() { + let rreg = reg.to_reg(); + let ty = match rreg.class() { + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Restores"), + }; + + >::from(Inst::gen_load( + reg.map(Reg::from), + Amode::SpOffset { + offset: clobber_offset.try_into().unwrap(), + }, + ty, + MemFlags::trusted(), + )) + .emit(sink, emit_info, state); + + clobber_offset -= 8 + } + + // Restore the link register and frame pointer using a `pop_frame` + // instruction. This will move `sp` to the current frame pointer and then + // restore the old lr/fp, so this restores all of sp/fp/lr in one + // instruction. + let setup_area_size = i64::from(state.frame_layout().setup_area_size); + assert!(setup_area_size > 0, "must have frame pointers enabled"); + >::from(RawInst::PopFrame).emit(sink, emit_info, state); + + // Now that `sp` is restored to what it was on function entry it may need to + // be adjusted if the stack arguments of our own function differ from the + // stack arguments of the callee. Perform any necessary adjustment here. + // + // Note that this means that there's a brief window where stack arguments + // might be below `sp` in the case that the callee has more stack arguments + // than ourselves. That's in theory ok though as we're inventing the pulley + // ABI and nothing like async signals are happening that we have to worry + // about. + let incoming_args_diff = + i64::from(state.frame_layout().tail_args_size - info.new_stack_arg_size); + + if incoming_args_diff != 0 { + let amt = i32::try_from(incoming_args_diff).unwrap(); + for inst in PulleyMachineDeps::

::gen_sp_reg_adjust(amt) { + >::from(inst).emit(sink, emit_info, state); + } + } +} diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index 11aac8e7c304..bc79c4b322b8 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -41,6 +41,20 @@ mod generated { include!(concat!(env!("OUT_DIR"), "/pulley_inst_gen.rs")); } +/// Out-of-line data for return-calls, to keep the size of `Inst` down. +#[derive(Clone, Debug)] +pub struct ReturnCallInfo { + /// Where this call is going. + pub dest: T, + + /// The size of the argument area for this return-call, potentially smaller + /// than that of the caller, but never larger. + pub new_stack_arg_size: u32, + + /// The in-register arguments and their constraints. + pub uses: CallArgList, +} + impl Inst { /// Generic constructor for a load (zero-extending where appropriate). pub fn gen_load(dst: Writable, mem: Amode, ty: Type, flags: MemFlags) -> Inst { @@ -154,6 +168,18 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { } collector.reg_clobbers(info.clobbers); } + Inst::ReturnCall { info } => { + for CallArgPair { vreg, preg } in &mut info.uses { + collector.reg_fixed_use(vreg, *preg); + } + } + Inst::ReturnIndirectCall { info } => { + collector.reg_use(&mut info.dest); + + for CallArgPair { vreg, preg } in &mut info.uses { + collector.reg_fixed_use(vreg, *preg); + } + } Inst::Jump { .. } => {} @@ -381,6 +407,7 @@ where Inst::Jump { .. } => MachTerminator::Uncond, Inst::BrIf { .. } => MachTerminator::Cond, Inst::BrTable { .. } => MachTerminator::Indirect, + Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. } => MachTerminator::Indirect, _ => MachTerminator::None, } } @@ -574,6 +601,15 @@ impl Inst { format!("indirect_call {callee}, {info:?}") } + Inst::ReturnCall { info } => { + format!("return_call {info:?}") + } + + Inst::ReturnIndirectCall { info } => { + let callee = format_reg(*info.dest); + format!("return_indirect_call {callee}, {info:?}") + } + Inst::IndirectCallHost { info } => { format!("indirect_call_host {info:?}") } diff --git a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs index 25f831b3d8d4..b059e08a3507 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs @@ -9,7 +9,9 @@ use inst::InstAndKind; use crate::ir::{condcodes::*, immediates::*, types::*, *}; use crate::isa::pulley_shared::{ abi::*, - inst::{FReg, OperandSize, VReg, WritableFReg, WritableVReg, WritableXReg, XReg}, + inst::{ + FReg, OperandSize, ReturnCallInfo, VReg, WritableFReg, WritableVReg, WritableXReg, XReg, + }, lower::{regs, Cond}, *, }; @@ -25,6 +27,8 @@ type VecArgPair = Vec; type VecRetPair = Vec; type BoxCallInfo = Box>; type BoxCallIndInfo = Box>; +type BoxReturnCallInfo = Box>; +type BoxReturnCallIndInfo = Box>; type BoxExternalName = Box; pub(crate) struct PulleyIsleContext<'a, 'b, I, B> diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs index f50da67e3a91..31de2c06803f 100644 --- a/crates/wasmtime/src/config.rs +++ b/crates/wasmtime/src/config.rs @@ -1963,12 +1963,8 @@ impl Config { // Pulley at this time fundamentally doesn't support the // `threads` proposal, notably shared memory, because Rust can't // safely implement loads/stores in the face of shared memory. - // - // Additionally pulley currently panics on tail-call generation - // in Cranelift ABI call which will get implemented in the - // future but is listed here for now as unsupported. if self.compiler_target().is_pulley() { - return WasmFeatures::TAIL_CALL | WasmFeatures::THREADS; + return WasmFeatures::THREADS; } // Other Cranelift backends are either 100% missing or complete diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index a5b4d1fcbd8a..ad17fea5edf8 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -308,7 +308,7 @@ impl Compiler { // support at this time (pulley is a work-in-progress) and so // individual tests are listed below as "should fail" even if // they're not covered in this list. - if config.tail_call() || config.wide_arithmetic() { + if config.wide_arithmetic() { return true; } } @@ -424,6 +424,15 @@ impl WastTest { "spec_testsuite/proposals/relaxed-simd/relaxed_laneselect.wast", "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast", "spec_testsuite/proposals/relaxed-simd/relaxed_min_max.wast", + "spec_testsuite/proposals/memory64/simd_lane.wast", + "spec_testsuite/proposals/memory64/simd_memory-multi.wast", + "spec_testsuite/proposals/memory64/relaxed_min_max.wast", + "spec_testsuite/proposals/memory64/relaxed_madd_nmadd.wast", + "spec_testsuite/proposals/memory64/relaxed_laneselect.wast", + "spec_testsuite/proposals/memory64/relaxed_dot_product.wast", + "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast", + "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast", + "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast", "spec_testsuite/simd_align.wast", "spec_testsuite/simd_bitwise.wast", "spec_testsuite/simd_boolean.wast", diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 3496867653a2..9ed62535b71e 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -1066,6 +1066,13 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + fn xjump(&mut self, reg: XReg) -> ControlFlow { + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(self.state[reg].get_ptr())); + } + ControlFlow::Continue(()) + } + fn br_if32(&mut self, cond: XReg, offset: PcRelOffset) -> ControlFlow { let cond = self.state[cond].get_u32(); if cond != 0 { diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index facda91c3f9c..d627db5c783b 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -101,6 +101,10 @@ macro_rules! for_each_op { /// Unconditionally transfer control to the PC at the given offset. jump = Jump { offset: PcRelOffset }; + /// Unconditionally transfer control to the PC at specified + /// register. + xjump = XJump { reg: XReg }; + /// Conditionally transfer control to the given PC offset if /// `low32(cond)` contains a non-zero value. br_if32 = BrIf { cond: XReg, offset: PcRelOffset }; diff --git a/tests/misc_testsuite/tail-call/loop-across-modules.wast b/tests/misc_testsuite/tail-call/loop-across-modules.wast index 8575c69ec55c..ea5fb610433d 100644 --- a/tests/misc_testsuite/tail-call/loop-across-modules.wast +++ b/tests/misc_testsuite/tail-call/loop-across-modules.wast @@ -42,5 +42,5 @@ (start $start) ) -(assert_return (invoke $B "g" (i32.const 100000000)) +(assert_return (invoke $B "g" (i32.const 100000)) (i32.const 42))