diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59def7c..9ed6a68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,6 +126,7 @@ set(HEADERS
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/allocator.h"
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/breakpoint.h"
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/decode.h"
+ "${LIB86CPU_ROOT_DIR}/lib86cpu/core/fpu.h"
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/helpers.h"
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/instructions.h"
  "${LIB86CPU_ROOT_DIR}/lib86cpu/core/internal.h"
diff --git a/lib86cpu/core/emitter/emitter_common.h b/lib86cpu/core/emitter/emitter_common.h
index fb558d0..7f8ab11 100644
--- a/lib86cpu/core/emitter/emitter_common.h
+++ b/lib86cpu/core/emitter/emitter_common.h
@@ -203,5 +203,17 @@ inline constexpr auto all_callable_funcs = std::make_tuple(
     fpu_update_tag,
     cpu_runtime_abort,
     dbg_update_exp_hook,
-    tlb_invalidate_
+    tlb_invalidate_,
+    fpu_stack_check<true, fpu_instr_t::integer8>,
+    fpu_stack_check<true, fpu_instr_t::integer16>,
+    fpu_stack_check<true, fpu_instr_t::integer32>,
+    fpu_stack_check<true, fpu_instr_t::integer64>,
+    fpu_stack_check<true, fpu_instr_t::float_>,
+    fpu_stack_check<true, fpu_instr_t::bcd>,
+    fpu_stack_check<false, fpu_instr_t::integer8>,
+    fpu_stack_check<false, fpu_instr_t::integer16>,
+    fpu_stack_check<false, fpu_instr_t::integer32>,
+    fpu_stack_check<false, fpu_instr_t::integer64>,
+    fpu_stack_check<false, fpu_instr_t::float_>,
+    fpu_stack_check<false, fpu_instr_t::bcd>
 );
diff --git a/lib86cpu/core/emitter/x64/jit.cpp b/lib86cpu/core/emitter/x64/jit.cpp
index a84dd0d..aff726d 100644
--- a/lib86cpu/core/emitter/x64/jit.cpp
+++ b/lib86cpu/core/emitter/x64/jit.cpp
@@ -420,6 +420,7 @@ static_assert((LOCAL_VARS_off(0) & 15) == 0); // must be 16 byte aligned so that
 
 #define LD_MEM() load_mem(m_cpu->size_mode, 0)
 #define LD_MEMs(size) load_mem(size, 0)
+#define LD_MEM80(idx) load_mem(SIZE80, 0) // NOTE(review): idx is not used by the expansion
 #define LD_MEM128() load_mem(SIZE128, 0)
 #define ST_MEM(val) store_mem(val, m_cpu->size_mode, 0)
 #define ST_MEMs(val, size) store_mem(val, size, 0)
@@ -456,6 +457,13 @@ static_assert((LOCAL_VARS_off(0) & 15) == 0); // must be 16 byte aligned so that
 #define RESTORE_FPU_CTX() FLDCW(MEMD16(RSP, LOCAL_VARS_off(5)))
 
 #define CALL_F(func) MOV(RAX, func); CALL(RAX); RELOAD_RCX_CTX()
+#define CALL_FPU_SET_CTX() MOV(RAX, m_cpu->set_host_fpu_ctx_fn); CALL(RAX)
+#define CALL_FPU_EXP_CHK() MOV(RAX, m_cpu->fpu_exp_post_check_fn); CALL(RAX)
+#define CALL_FPU_STACK_CHK(is_push, instr_ty) LEA(R8, MEMD64(RSP, LOCAL_VARS_off(0))); \
+LEA(RDX, MEMD64(RSP, LOCAL_VARS_off(2))); \
+CALL_F((&fpu_stack_check<is_push, instr_ty>)); \
+MOV(EBX, EAX); \
+MOV(R8D, MEMD32(RSP, LOCAL_VARS_off(2))); // on exit: EBX = returned fpu stack top, R8D = status word flags, LOCAL_VARS_off(0) = indefinite value (only on a masked stack fault)
 
 lc86_jit::lc86_jit(cpu_t *cpu)
 {
@@ -552,27 +560,41 @@ lc86_jit::gen_aux_funcs()
     MOV(EAX, MEMD32(RCX, CPU_CTX_INT));
     RET();
 
-    // raise any int
-    size_t raise_int_off = m_a.offset(), raise_int_off_aligned16 = (raise_int_off + 15) & ~15;
-    if (raise_int_off_aligned16 > raise_int_off) {
-        for (unsigned i = 0; i < (raise_int_off_aligned16 - raise_int_off); ++i) {
-            INT3();
+    const auto &align_next_func_start = [this]() {
+        size_t off = m_a.offset(), off_aligned16 = (off + 15) & ~15;
+        if (off_aligned16 > off) {
+            for (unsigned i = 0; i < (off_aligned16 - off); ++i) {
+                INT3();
+            }
         }
-    }
+        return off_aligned16;
+    };
+
+    // raise any int
+    size_t raise_int_off_aligned16 = align_next_func_start();
     m_a.lock().or_(MEMD32(RCX, CPU_CTX_INT), EDX);
     RET();
 
     // clear any int
-    size_t clear_int_off = m_a.offset(), clear_int_off_aligned16 = (clear_int_off + 15) & ~15;
-    if (clear_int_off_aligned16 > clear_int_off) {
-        for (unsigned i = 0; i < (clear_int_off_aligned16 - clear_int_off); ++i) {
-            INT3();
-        }
-    }
+    size_t clear_int_off_aligned16 = align_next_func_start();
     NOT(EDX);
     m_a.lock().and_(MEMD32(RCX, CPU_CTX_INT), EDX);
     RET();
 
+    // We generate the following fpu related functions once here, to avoid having to generate them for every guest fpu instruction encountered. We cannot use host helpers for these
+    // because the guest control word state must be mirrored to the host fpu while a guest fpu instruction is emulated, and the WIN64 calling convention
+    // states that the control word is non-volatile across function calls
+
+    // gen_set_host_fpu_ctx
+    size_t gen_set_host_fpu_ctx_off_aligned16 = align_next_func_start();
+    gen_set_host_fpu_ctx();
+    RET();
+
+    // gen_fpu_exp_post_check
+    size_t gen_fpu_exp_post_check_off_aligned16 = align_next_func_start();
+    gen_fpu_exp_post_check();
+    RET();
+
     if (auto err = m_code.flatten()) {
         std::string err_str("Asmjit failed at flatten() with the error ");
         err_str += DebugUtils::errorAsString(err);
@@ -611,6 +633,8 @@ lc86_jit::gen_aux_funcs()
     m_cpu->read_int_fn = reinterpret_cast(static_cast(block.addr) + offset);
     m_cpu->raise_int_fn = reinterpret_cast(static_cast(block.addr) + offset + raise_int_off_aligned16);
     m_cpu->clear_int_fn = reinterpret_cast(static_cast(block.addr) + offset + clear_int_off_aligned16);
+    m_cpu->set_host_fpu_ctx_fn = reinterpret_cast<fpu_func_t>(static_cast<uint8_t *>(block.addr) + offset + gen_set_host_fpu_ctx_off_aligned16);
+    m_cpu->fpu_exp_post_check_fn = reinterpret_cast<fpu_func_t>(static_cast<uint8_t *>(block.addr) + offset + gen_fpu_exp_post_check_off_aligned16);
 }
 
 void
@@ -2351,81 +2375,6 @@ lc86_jit::gen_simd_mem_align_check()
     m_a.bind(ok);
 }
 
-template
-void lc86_jit::gen_fpu_stack_check()
-{
-    // this function places in EBX the value of the fpu stack top after the push/pop, and in R8D the flags of the status word following a stack fault. It also writes
-    // to the host stack, at offset zero, an appropriate indefinite value when it detects a masked stack exception
-    // NOTE: we only support masked stack exceptions for now
-
-    Label no_stack_fault = m_a.newLabel(), exp_masked = m_a.newLabel();
-    XOR(R8D, R8D);
-    if constexpr (is_push) {
-        // detect stack overflow
-        MOVZX(EBX, MEMD16(RCX, FPU_DATA_FTOP));
-        SUB(EBX, 1);
-        AND(EBX, 7);
-        MOV(AX, MEMSD16(RCX, RBX, 0, CPU_CTX_FTAGS0));
-        CMP(AX, FPU_TAG_EMPTY);
-        BR_EQ(no_stack_fault);
-    }
-    else {
-        // detect stack underflow
-        MOVZX(EBX, MEMD16(RCX, FPU_DATA_FTOP));
-        ADD(EBX, 1);
-        AND(EBX, 7);
-        MOV(AX, MEMSD16(RCX, RBX, 0, CPU_CTX_FTAGS0));
-        CMP(AX, FPU_TAG_EMPTY);
-        BR_NE(no_stack_fault);
-    }
-
-    MOVZX(EAX, MEMD16(RCX, CPU_CTX_FCTRL));
-    TEST(EAX, FPU_EXP_INVALID);
-    BR_NE(exp_masked);
-    static const char *abort_msg = "Unmasked fpu stack exception not supported";
-    MOV(RCX, abort_msg);
-    MOV(RAX, &cpu_runtime_abort);
-    CALL(RAX); // won't return
-    INT3();
-    m_a.bind(exp_masked);
-    // stack fault exception masked, write an indefinite value, so that the fpu instr uses it
-    MOVZX(R8D, MEMD16(RCX, CPU_CTX_FSTATUS));
-    OR(R8D, FPU_FLG_IE | FPU_FLG_SF | (is_push ? (1 << FPU_C1_SHIFT) : (0 << FPU_C1_SHIFT)));
-
-    switch (instr_type)
-    {
-    case fpu_instr_t::integer8:
-        MOV(MEMD8(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE8);
-        break;
-
-    case fpu_instr_t::integer16:
-        MOV(MEMD16(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE16);
-        break;
-
-    case fpu_instr_t::integer32:
-        MOV(MEMD32(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE32);
-        break;
-
-    case fpu_instr_t::integer64:
-        MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE64);
-        break;
-
-    case fpu_instr_t::float_:
-        MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_QNAN_FLOAT_INDEFINITE64);
-        MOV(MEMD16(RSP, LOCAL_VARS_off(1)), FPU_QNAN_FLOAT_INDEFINITE16);
-        break;
-
-    case fpu_instr_t::bcd:
-        MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_BCD_INDEFINITE64);
-        MOV(MEMD16(RSP, LOCAL_VARS_off(1)), FPU_BCD_INDEFINITE16);
-        break;
-
-    default:
-        LIB86CPU_ABORT();
-    }
-    m_a.bind(no_stack_fault);
-}
-
 void
 lc86_jit::gen_fpu_exp_post_check()
 {
@@ -5153,17 +5102,24 @@ lc86_jit::fld(decoded_instr *instr)
         RAISEin0_t(EXP_NM);
     }
     else {
-        // XXX missing check for fpu unmasked exceptions
         get_rm(instr,
             [this, instr](const op_info rm)
             {
-                gen_fpu_stack_check();
-                gen_set_host_fpu_ctx();
+                Label stack_fault = m_a.newLabel(), ok = m_a.newLabel();
+                MOV(MEMD64(RSP, LOCAL_VARS_off(0)), 0); // cleared first, so that a non-zero value after the stack check means an indefinite value was written
+                CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
+                CALL_FPU_SET_CTX();
                 MOV(EDX, instr->i.raw.modrm.rm);
                 MOV(EAX, sizeof(uint80_t));
                 MUL(DX);
+                CMP(MEMD64(RSP, LOCAL_VARS_off(0)), 0); // must be CMP: a TEST against the immediate zero always sets ZF, and the branch below would never be taken
+                BR_NE(stack_fault);
                 FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0));
-                gen_fpu_exp_post_check();
+                BR_UNCOND(ok);
+                m_a.bind(stack_fault);
+                FLD(MEMD80(RSP, LOCAL_VARS_off(0))); // load the whole 80 bit indefinite value written by fpu_stack_check
+                m_a.bind(ok);
+                CALL_FPU_EXP_CHK();
                 MOV(EAX, sizeof(uint80_t));
                 ST_R16(FPU_DATA_FTOP, BX);
                 MUL(BX);
@@ -5177,23 +5133,23 @@ lc86_jit::fld(decoded_instr *instr)
                 case 0xD9:
                     LD_MEMs(SIZE32);
                     MOV(MEMD32(RSP, LOCAL_VARS_off(0)), EAX);
-                    gen_fpu_stack_check();
-                    gen_set_host_fpu_ctx();
+                    CALL_FPU_STACK_CHK(true, fpu_instr_t::float_); // NOTE(review): on a stack fault this overwrites the operand with an 80 bit indefinite, but the FLD below only reads 32 bits of it - confirm
+                    CALL_FPU_SET_CTX();
                     FLD(MEMD32(RSP, LOCAL_VARS_off(0)));
                     break;
 
                 case 0xDD:
                     LD_MEMs(SIZE64);
                     MOV(MEMD64(RSP, LOCAL_VARS_off(0)), RAX);
-                    gen_fpu_stack_check();
-                    gen_set_host_fpu_ctx();
+                    CALL_FPU_STACK_CHK(true, fpu_instr_t::float_); // NOTE(review): same as the 0xD9 case, the FLD below only reads 64 bits of the 80 bit indefinite - confirm
+                    CALL_FPU_SET_CTX();
                     FLD(MEMD64(RSP, LOCAL_VARS_off(0)));
                     break;
 
                 case 0xDB:
-                    LD_MEMs(SIZE80);
-                    gen_fpu_stack_check();
-                    gen_set_host_fpu_ctx();
+                    LD_MEM80(0);
+                    CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
+                    CALL_FPU_SET_CTX();
                     FLD(MEMD80(RSP, LOCAL_VARS_off(0)));
                     break;
 
@@ -5201,7 +5157,7 @@ lc86_jit::fld(decoded_instr *instr)
                     LIB86CPU_ABORT();
                 }
 
-                gen_fpu_exp_post_check();
+                CALL_FPU_EXP_CHK();
                 MOV(EAX, sizeof(uint80_t));
                 ST_R16(FPU_DATA_FTOP, BX);
                 MUL(BX);
diff --git a/lib86cpu/core/emitter/x64/jit.h b/lib86cpu/core/emitter/x64/jit.h
index 99e725f..0c9fedf 100644
--- a/lib86cpu/core/emitter/x64/jit.h
+++ b/lib86cpu/core/emitter/x64/jit.h
@@ -25,14 +25,6 @@ struct op_info {
     op_info(size_t val_, size_t bits_) : val(val_), bits(bits_) {}
 };
 
-enum class fpu_instr_t : int {
-    integer8,
-    integer16,
-    integer32,
-    integer64,
-    float_,
-    bcd,
-};
 
 class lc86_jit : public Target {
 public:
@@ -253,8 +245,6 @@ class lc86_jit : public Target {
     template
     void gen_stack_pop();
     void gen_simd_mem_align_check();
-    template
-    void gen_fpu_stack_check();
     void gen_fpu_exp_post_check();
     void gen_set_host_fpu_ctx();
     template
diff --git a/lib86cpu/core/fpu.cpp b/lib86cpu/core/fpu.cpp
index bb15865..bdee5cb 100644
--- a/lib86cpu/core/fpu.cpp
+++ b/lib86cpu/core/fpu.cpp
@@ -37,3 +37,89 @@ fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx)
         cpu_ctx->regs.ftags[idx] = FPU_TAG_VALID;
     }
 }
+
+template<bool is_push, fpu_instr_t instr_type>
+uint32_t fpu_stack_check(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val)
+{
+    // this function returns the fpu stack top after the push/pop, and writes to *sw the flags of the status word following a stack fault (zero when there is no fault). It also writes
+    // to *inv_val the indefinite value selected by instr_type when it detects a masked stack exception; *inv_val is left untouched otherwise
+    // NOTE: we only support masked stack exceptions for now, an unmasked one ends in cpu_runtime_abort
+
+    *sw = 0;
+    uint32_t ftop;
+    bool no_stack_fault;
+    if constexpr (is_push) {
+        // detect stack overflow
+        ftop = cpu_ctx->fpu_data.ftop;
+        ftop -= 1;
+        ftop &= 7;
+        no_stack_fault = cpu_ctx->regs.ftags[ftop] == FPU_TAG_EMPTY;
+    }
+    else {
+        // detect stack underflow
+        ftop = cpu_ctx->fpu_data.ftop;
+        ftop += 1;
+        ftop &= 7;
+        no_stack_fault = cpu_ctx->regs.ftags[ftop] != FPU_TAG_EMPTY;
+    }
+
+    if (!no_stack_fault) {
+        uint16_t fctrl = cpu_ctx->regs.fctrl;
+        fctrl &= FPU_EXP_INVALID;
+        if (fctrl == 0) {
+            static const char *abort_msg = "Unmasked fpu stack exception not supported";
+            cpu_runtime_abort(abort_msg); // won't return
+        }
+        // stack fault exception masked, write an indefinite value, so that the fpu instr uses it
+        uint32_t fstatus = cpu_ctx->regs.fstatus;
+        fstatus |= (FPU_FLG_IE | FPU_FLG_SF | (is_push ? (1 << FPU_C1_SHIFT) : (0 << FPU_C1_SHIFT)));
+        *sw = fstatus;
+
+        switch (instr_type)
+        {
+        case fpu_instr_t::integer8:
+            inv_val->low = FPU_INTEGER_INDEFINITE8;
+            break;
+
+        case fpu_instr_t::integer16:
+            inv_val->low = FPU_INTEGER_INDEFINITE16;
+            break;
+
+        case fpu_instr_t::integer32:
+            inv_val->low = FPU_INTEGER_INDEFINITE32;
+            break;
+
+        case fpu_instr_t::integer64:
+            inv_val->low = FPU_INTEGER_INDEFINITE64;
+            break;
+
+        case fpu_instr_t::float_:
+            inv_val->low = FPU_QNAN_FLOAT_INDEFINITE64;
+            inv_val->high = FPU_QNAN_FLOAT_INDEFINITE16;
+            break;
+
+        case fpu_instr_t::bcd:
+            inv_val->low = FPU_BCD_INDEFINITE64;
+            inv_val->high = FPU_BCD_INDEFINITE16;
+            break;
+
+        default:
+            LIB86CPU_ABORT();
+        }
+    }
+
+    return ftop;
+}
+
+template uint32_t fpu_stack_check<true, fpu_instr_t::integer8>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<true, fpu_instr_t::integer16>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<true, fpu_instr_t::integer32>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<true, fpu_instr_t::integer64>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<true, fpu_instr_t::float_>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<true, fpu_instr_t::bcd>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::integer8>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::integer16>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::integer32>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::integer64>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::float_>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
+template uint32_t fpu_stack_check<false, fpu_instr_t::bcd>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val);
diff --git a/lib86cpu/core/fpu.h b/lib86cpu/core/fpu.h
new file mode 100644
index 0000000..babe53f
--- /dev/null
+++ b/lib86cpu/core/fpu.h
@@ -0,0 +1,22 @@
+/*
+ * ergo720 Copyright (c) 2024
+ */
+
+#pragma once
+
+#include "lib86cpu_priv.h"
+
+
+enum class fpu_instr_t : int { // operand kind of an fpu instruction; fpu_stack_check switches on it to pick the indefinite value it writes
+    integer8 = 0,
+    integer16,
+    integer32,
+    integer64,
+    float_,
+    bcd,
+};
+
+void fpu_init(cpu_t *cpu);
+void JIT_API fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx);
+template<bool is_push, fpu_instr_t instr_type>
+uint32_t JIT_API fpu_stack_check(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val); // returns the fpu stack top after the push/pop; *sw and *inv_val are outputs
diff --git a/lib86cpu/core/internal.h b/lib86cpu/core/internal.h
index 31924b8..3bf5057 100644
--- a/lib86cpu/core/internal.h
+++ b/lib86cpu/core/internal.h
@@ -9,6 +9,7 @@
 #include "decode.h"
 #include "support.h"
 #include "breakpoint.h"
+#include "fpu.h"
 
 
 template
@@ -21,8 +22,6 @@ addr_t get_pc(cpu_ctx_t *cpu_ctx);
 template
 translated_code_t * JIT_API cpu_raise_exception(cpu_ctx_t *cpu_ctx);
 uint32_t JIT_API cpu_do_int(cpu_ctx_t *cpu_ctx, uint32_t int_flg);
-void fpu_init(cpu_t *cpu);
-void JIT_API fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx);
 void halt_loop(cpu_t *cpu);
 void JIT_API tlb_invalidate_(cpu_ctx_t *cpu_ctx, addr_t addr);
 
@@ -422,7 +421,7 @@ CR0_TS_MASK | CR0_EM_MASK | CR0_MP_MASK | CR0_PE_MASK)
 // fpu indefinite values
 #define FPU_INTEGER_INDEFINITE8 (1 << 7)
 #define FPU_INTEGER_INDEFINITE16 (1 << 15)
-#define FPU_INTEGER_INDEFINITE32 (1 << 31)
+#define FPU_INTEGER_INDEFINITE32 (1UL << 31) // unsigned literal, so that the 1 is not shifted into the sign bit of an int
 #define FPU_INTEGER_INDEFINITE64 (1ULL << 63)
 #define FPU_QNAN_FLOAT_INDEFINITE64 0xC000000000000000 // mantissa part
 #define FPU_QNAN_FLOAT_INDEFINITE16 0xFFFF // exponent and sign parts
diff --git a/lib86cpu/lib86cpu_priv.h b/lib86cpu/lib86cpu_priv.h
index bab9dca..7255ad8 100644
--- a/lib86cpu/lib86cpu_priv.h
+++ b/lib86cpu/lib86cpu_priv.h
@@ -100,6 +100,7 @@ using entry_t = translated_code_t *(JIT_API *)(cpu_ctx_t *cpu_ctx);
 using read_int_t = uint32_t(JIT_API *)(cpu_ctx_t *cpu_ctx);
 using raise_int_t = void(JIT_API *)(cpu_ctx_t *cpu_ctx, uint32_t int_flg);
 using clear_int_t = void(JIT_API *)(cpu_ctx_t *cpu_ctx, uint32_t int_flg);
+using fpu_func_t = void(*)(); // type of the jit-generated fpu helpers stored in cpu_t
 
 // jmp_offset functions: 0,1 -> used for direct linking (either points to exit or &next_tc), 2 -> exit
 struct translated_code_t {
@@ -208,6 +209,9 @@ struct cpu_t {
     read_int_t read_int_fn;
     raise_int_t raise_int_fn;
     clear_int_t clear_int_fn;
+    // jit-generated fpu helpers: don't call these fpu functions from the host, they won't work correctly. Only call them from the jitted code
+    fpu_func_t set_host_fpu_ctx_fn;
+    fpu_func_t fpu_exp_post_check_fn;
     fp_int get_int_vec;
     std::string dbg_name;
     addr_t bp_addr;