diff --git a/lib86cpu/core/emitter/emitter_common.cpp b/lib86cpu/core/emitter/emitter_common.cpp
index 497646e..ded1c48 100644
--- a/lib86cpu/core/emitter/emitter_common.cpp
+++ b/lib86cpu/core/emitter/emitter_common.cpp
@@ -7,6 +7,15 @@
 #include "emitter_common.h"
 
 
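+// For gen_update_fpu_ptr / fpu_update_ptr: the segment register offset is packed in a 16 bit field of instr_info, so it must fit in 16 bits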
+static_assert(CPU_CTX_ES < 65536);
+static_assert(CPU_CTX_CS < 65536);
+static_assert(CPU_CTX_SS < 65536);
+static_assert(CPU_CTX_DS < 65536);
+static_assert(CPU_CTX_FS < 65536);
+static_assert(CPU_CTX_GS < 65536);
+
+
 static const std::unordered_map<ZydisRegister, const std::pair<int, size_t>> zydis_to_reg_offset_table = {
 	{ ZYDIS_REGISTER_AL,         { EAX_idx,       CPU_CTX_EAX     }  },
 	{ ZYDIS_REGISTER_CL,         { ECX_idx,       CPU_CTX_ECX     }  },
diff --git a/lib86cpu/core/emitter/emitter_common.h b/lib86cpu/core/emitter/emitter_common.h
index 2969274..9c6b2e5 100644
--- a/lib86cpu/core/emitter/emitter_common.h
+++ b/lib86cpu/core/emitter/emitter_common.h
@@ -208,6 +208,9 @@ inline constexpr auto all_callable_funcs = std::make_tuple(
 	cpu_runtime_abort,
 	dbg_update_exp_hook,
 	tlb_invalidate_,
-	fpu_stack_check<true>,
-	fpu_stack_check<false>
+	fpu_is_tag_empty,
+	fpu_stack_overflow,
+	fpu_stack_underflow,
+	fpu_stack_fault,
+	fpu_update_ptr
 );
diff --git a/lib86cpu/core/emitter/x64/jit.cpp b/lib86cpu/core/emitter/x64/jit.cpp
index 1d0262a..9b9d71d 100644
--- a/lib86cpu/core/emitter/x64/jit.cpp
+++ b/lib86cpu/core/emitter/x64/jit.cpp
@@ -507,6 +507,17 @@ static_assert((LOCAL_VARS_off(0) & 15) == 0); // must be 16 byte aligned so that
 #define RELOAD_RCX_CTX() MOV(RCX, &m_cpu->cpu_ctx)
 #define RESTORE_FPU_CTX() FLDCW(MEMD16(RSP, LOCAL_VARS_off(5)))
 #define CALL_F(func) MOV(RAX, func); CALL(RAX); RELOAD_RCX_CTX()
+#define FPU_IS_TAG_EMPTY(num) MOV(EDX, (num)); /* st_num argument of fpu_is_tag_empty */ \
+	CALL_F(&fpu_is_tag_empty); \
+	TEST(EAX, EAX) /* ZF is set when the helper returned zero, that is, when the tag is not empty */
+#define FPU_CLEAR_C1() AND(MEMD16(RCX, CPU_CTX_FSTATUS), ~FPU_FLG_C1) // clears C1 in the guest fstatus
+#define FPU_PUSH() DEC(MEMD16(RCX, FPU_DATA_FTOP)); AND(MEMD16(RCX, FPU_DATA_FTOP), 7) // ftop = (ftop - 1) & 7
+#define FPU_LOAD_STX(x) MOVZX(EDX, MEMD16(RCX, FPU_DATA_FTOP)); \
+	ADD(EDX, (x)); \
+	AND(EDX, 7); /* edx = (ftop + x) & 7 */ \
+	MOV(EAX, sizeof(uint80_t)); \
+	MUL(DX); /* ax = edx * sizeof(uint80_t), dx is clobbered by the result */ \
+	FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)) /* load guest st(x) to host st0 */
 
 
 lc86_jit::lc86_jit(cpu_t *cpu)
@@ -2345,30 +2356,23 @@ lc86_jit::gen_simd_mem_align_check()
 	m_a.bind(ok);
 }
 
-void
-lc86_jit::gen_fpu_exp_post_check()
+template<typename T>
+void lc86_jit::gen_fpu_exp_post_check(uint32_t exception, T &&unmasked)
 {
-	// this function should be called immediately after the fpu instr to check exceptions for. It expects to find in R8W the flags of the status word following
-	// a previous stack fault (if any happened)
-	// NOTE: we only support masked exceptions for now
+	// This function should be called immediately after the fpu instr to check exceptions for. It merges the host exception flags and condition codes into the guest fstatus, and emits unmasked() for the case where any flag of exception is raised and not masked by the guest fctrl
 
-	Label no_exp = m_a.newLabel();
+	Label masked = m_a.newLabel();
 	FNSTSW(AX);
-	TEST(AX, FPU_EXP_ALL);
-	BR_EQ(no_exp);
-	LD_R16(R9W, CPU_CTX_FCTRL);
-	AND(R9W, FPU_EXP_ALL);
-	CMP(R9W, FPU_EXP_ALL);
-	BR_EQ(no_exp);
-	static const char *abort_msg = "Unmasked fpu exceptions are not supported";
-	MOV(RCX, abort_msg);
-	MOV(RAX, &cpu_runtime_abort);
-	CALL(RAX); // won't return
-	INT3();
-	m_a.bind(no_exp);
-	AND(AX, ~(FPU_FLG_SF | FPU_FLG_ES | FPU_FLG_TOP | FPU_FLG_BSY));
-	OR(AX, R8W);
-	ST_R16(CPU_CTX_FSTATUS, AX);
+	MOV(DX, MEMD16(RCX, CPU_CTX_FCTRL));
+	NOT(DX);
+	AND(DX, AX); // dx = host status bits that are not masked by the guest fctrl
+	AND(AX, (FPU_EXP_ALL | FPU_FLG_CC_ALL));
+	AND(MEMD16(RCX, CPU_CTX_FSTATUS), ~FPU_FLG_CC_ALL); // only the condition codes are replaced: x87 exception flags are sticky and must accumulate in the guest fstatus
+	OR(MEMD16(RCX, CPU_CTX_FSTATUS), AX); // merge the new exception flags and condition codes into the guest fstatus
+	TEST(DX, exception); // test if exceptions of interest are unmasked
+	BR_EQ(masked);
+	unmasked();
+	m_a.bind(masked);
 }
 
 void
@@ -2380,26 +2384,48 @@ lc86_jit::gen_set_host_fpu_ctx()
 	FNCLEX(); // clear all pending fpu exceptions, so that we can use the host to detect guest fpu exceptions
 }
 
-template<bool update_fdp>
-void lc86_jit::gen_update_fpu_ptr(decoded_instr *instr)
+void
+lc86_jit::gen_update_fpu_ptr(decoded_instr *instr, x86::Gp mem_addr64)
 {
-	ST_R16(CPU_CTX_FCS, m_cpu->cpu_ctx.regs.cs);
-	ST_R32(CPU_CTX_FIP, m_cpu->instr_eip);
-	MOV(AX, MEMD16(RCX, CPU_CTX_DS));
-	ST_R16(CPU_CTX_FDS, AX);
-	if constexpr (update_fdp) {
-		ST_R32(CPU_CTX_FDP, m_cpu->instr_eip + instr->i.raw.modrm.offset);
+	/* Emits a call to fpu_update_ptr, passing in rdx an instr_info value packed as follows:
+	is_mem_op -> bit 63
+	fop -> [48 - 58]
+	seg offset -> [32 - 47]
+	modrm addr -> [0 - 31]
+	*/
+
+	uint64_t is_mem_operand = instr->i.raw.modrm.mod != 3; // all fpu instr with a modrm >= 0xC0 (mod == 3) have reg only operands, whatever the value of the reg field
+	uint64_t instr_info = is_mem_operand << 63;
+	instr_info |= ((((instr->i.raw.modrm.rm | (instr->i.raw.modrm.reg << 3) | (instr->i.raw.modrm.mod << 6)) | ((uint64_t)instr->i.opcode << 8)) & 0x7FF) << 48); // fop is a 11 bit register
+	// the seg offset must be merged before instr_info is emitted as an immediate below, otherwise it never reaches fpu_update_ptr
+	instr_info |= (is_mem_operand ? ((uint64_t)get_seg_prfx_offset(instr) << 32) : 0ULL);
+	MOV(RDX, instr_info);
+	if (is_mem_operand) {
+		OR(RDX, mem_addr64); // modrm addr is calculated at runtime. NOTE(review): assumes the upper 32 bits of mem_addr64 are zero -- confirm at call sites
 	}
+	CALL_F(&fpu_update_ptr);
+}
+
+void
+lc86_jit::gen_fpu_exp(uint32_t exception, stack_fault_func func)
+{
+	MOV(EDX, exception); // second argument of func: the fpu exception flags to report
+	CALL_F(func); // emits the call to the helper; CALL_F also reloads rcx with the cpu_ctx pointer afterwards
 }
 
-template<bool is_push>
-void lc86_jit::gen_fpu_stack_fault_check(fpu_instr_t fpu_instr)
+void
+lc86_jit::gen_check_fpu_unmasked_exp()
 {
-	MOV(R9D, fpu_instr);
-	LEA(R8, MEMD64(RSP, LOCAL_VARS_off(0)));
-	LEA(RDX, MEMD64(RSP, LOCAL_VARS_off(2)));
-	CALL_F((&fpu_stack_check<is_push>));
-	MOV(R8D, MEMD32(RSP, LOCAL_VARS_off(2)));
+	Label no_exp = m_a.newLabel();
+	TEST(MEMD16(RCX, CPU_CTX_FSTATUS), FPU_FLG_ES); // runtime check: is the ES flag set in the guest fstatus?
+	BR_EQ(no_exp); // ES clear, nothing to do
+	if (m_cpu->cpu_ctx.regs.cr0 & CR0_NE_MASK) { // this check is done at translation time, not in the emitted code
+		RAISEin0_f(EXP_MF);
+	}
+	else {
+		LIB86CPU_ABORT_msg("MS-DOS compatibility mode for fpu exceptions is not supported");
+	}
+	m_a.bind(no_exp);
 }
 
 template<bool is_push, typename T>
@@ -2428,6 +2454,8 @@ void lc86_jit::gen_fpu_stack_prologue(fpu_instr_t fpu_instr, T &&action_when_no_
 template<ZydisMnemonic mnemonic>
 void lc86_jit::float_arithmetic(decoded_instr *instr)
 {
+	LIB86CPU_ABORT();
+#if 0
 	if (m_cpu->cpu_ctx.hflags & (HFLG_CR0_EM | HFLG_CR0_TS)) {
 		RAISEin0_t(EXP_NM);
 	}
@@ -2529,6 +2557,7 @@ void lc86_jit::float_arithmetic(decoded_instr *instr)
 
 		}
 	}
+#endif
 }
 
 template<unsigned idx>
@@ -3526,7 +3555,8 @@ template<unsigned idx>
 void lc86_jit::float_load_constant(decoded_instr *instr)
 {
 	// idx 0 -> fld1, 1 -> fldl2e, 2 -> fldl2t, 3 -> fldlg2, 4 -> fldln2, 5 -> fldpi, 6 -> fldz
-
+	LIB86CPU_ABORT();
+#if 0
 	if (m_cpu->cpu_ctx.hflags & (HFLG_CR0_EM | HFLG_CR0_TS)) {
 		RAISEin0_t(EXP_NM);
 	}
@@ -3567,6 +3597,7 @@ void lc86_jit::float_load_constant(decoded_instr *instr)
 		MOV(EDX, EBX);
 		CALL_F(&fpu_update_tag<true>);
 	}
+#endif
 }
 
 void
@@ -5248,6 +5279,8 @@ lc86_jit::enter(decoded_instr *instr)
 void
 lc86_jit::fild(decoded_instr *instr)
 {
+	LIB86CPU_ABORT();
+#if 0
 	if (m_cpu->cpu_ctx.hflags & (HFLG_CR0_EM | HFLG_CR0_TS)) {
 		RAISEin0_t(EXP_NM);
 	}
@@ -5281,11 +5314,14 @@ lc86_jit::fild(decoded_instr *instr)
 				CALL_F(&fpu_update_tag<true>);
 			});
 	}
+#endif
 }
 
 void
 lc86_jit::fistp(decoded_instr *instr)
 {
+	LIB86CPU_ABORT();
+#if 0
 	if (m_cpu->cpu_ctx.hflags & (HFLG_CR0_EM | HFLG_CR0_TS)) {
 		RAISEin0_t(EXP_NM);
 	}
@@ -5328,6 +5364,7 @@ lc86_jit::fistp(decoded_instr *instr)
 			CALL_F(&fpu_update_tag<false>);
 		}
 	}
+#endif
 }
 
 void
@@ -5337,45 +5374,77 @@ lc86_jit::fld(decoded_instr *instr)
 		RAISEin0_t(EXP_NM);
 	}
 	else {
+		gen_check_fpu_unmasked_exp(); // check for a pending unmasked fpu exception (ES set in guest fstatus) before executing this instr
+		Label end_instr = m_a.newLabel();
+		const auto stack_fault_check = [&]() {
+			Label ok = m_a.newLabel();
+			FPU_IS_TAG_EMPTY(-1); // check for stack overflow of dst st0
+			BR_NE(ok);
+			gen_fpu_exp(FPU_STACK_OVERFLOW, &fpu_stack_overflow);
+			BR_UNCOND(end_instr); // the helper already handled the overflow, skip the load and the tag update
+			m_a.bind(ok);
+			};
+
 		get_rm<OPNUM_SINGLE>(instr,
-			[this, instr](const op_info rm)
+			[this, instr, end_instr, &stack_fault_check](const op_info rm)
 			{
-				gen_fpu_stack_prologue<true>(fpu_instr_t::float_, [this, instr]() {
-					MOV(EDX, instr->i.raw.modrm.rm);
-					MOV(EAX, sizeof(uint80_t));
-					MUL(DX);
-					FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // load guest st(i) to host st0
-					});
-				gen_fpu_exp_post_check();
-				MOV(EBX, MEMD32(RSP, LOCAL_VARS_off(4)));
+				Label ok = m_a.newLabel(), masked = m_a.newLabel(), do_push = m_a.newLabel();
+				unsigned stx = instr->i.raw.modrm.rm;
+				gen_update_fpu_ptr(instr);
+				FPU_CLEAR_C1();
+				stack_fault_check();
+				FPU_IS_TAG_EMPTY(stx); // check for stack underflow for src stx
+				BR_EQ(ok);
+				gen_fpu_exp(FPU_STACK_UNDERFLOW, &fpu_stack_fault);
+				TEST(MEMD16(RCX, CPU_CTX_FCTRL), FPU_FLG_IE);
+				BR_NE(masked);
+				BR_UNCOND(end_instr); // unmasked stack underflow: the instr is not executed
+				m_a.bind(masked); // if masked, load a qnan
+				EMMS();
+				FNCLEX();
+				MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_QNAN_FLOAT_INDEFINITE64);
+				MOV(MEMD64(RSP, LOCAL_VARS_off(1)), FPU_QNAN_FLOAT_INDEFINITE16);
+				FLD(MEMD80(RSP, LOCAL_VARS_off(0))); // load qnan
+				BR_UNCOND(do_push);
+				m_a.bind(ok);
+				EMMS();
+				FNCLEX();
+				FPU_LOAD_STX(stx); // load src stx
+				m_a.bind(do_push);
+				FPU_PUSH();
 				MOV(EAX, sizeof(uint80_t));
-				ST_R16(FPU_DATA_FTOP, BX);
-				MUL(BX);
-				FSTP(MEMSD80(RCX, RAX, 0, CPU_CTX_R0));
-				gen_update_fpu_ptr<false>(instr);
+				MUL(MEMD16(RCX, FPU_DATA_FTOP));
+				FSTP(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // store src stx or qnan to dst st0
 			},
-			[this, instr](const op_info rm)
+			[this, instr, end_instr, &stack_fault_check](const op_info rm)
 			{
 				uint8_t size = instr->i.opcode == 0xD9 ? SIZE32 : (instr->i.opcode == 0xDD ? SIZE64 : SIZE80);
-				LD_MEMs(size);
+				MOV(EBX, EDX); // save mem addr for gen_update_fpu_ptr, which reads it from rbx by default
+				LD_MEMs(size); // load src mem
 				if (size != SIZE80) {
-					MOV(MEMD(RSP, LOCAL_VARS_off(0), size), EAX);
+					MOV(MEMD(RSP, LOCAL_VARS_off(0), size), SIZED_REG(x64::rax, size));
 				}
-				gen_fpu_stack_fault_check<true>(fpu_instr_t::float_);
+				gen_update_fpu_ptr(instr);
+				FPU_CLEAR_C1();
+				stack_fault_check();
 				gen_set_host_fpu_ctx();
 				FLD(MEMD(RSP, LOCAL_VARS_off(0), size));
-				gen_fpu_exp_post_check();
-				MOV(EBX, MEMD32(RSP, LOCAL_VARS_off(4)));
+				gen_fpu_exp_post_check(FPU_EXP_INVALID, [this, instr, end_instr, size]() {
+					FSTP(MEMD(RSP, LOCAL_VARS_off(0), size)); // do a dummy pop to restore host fpu stack
+					RESTORE_FPU_CTX(); // this path skips the restore done after the store below, so the host control word must be restored here too
+					OR(MEMD16(RCX, CPU_CTX_FSTATUS), FPU_FLG_ES);
+					BR_UNCOND(end_instr);
+					});
+				FPU_PUSH();
 				MOV(EAX, sizeof(uint80_t));
-				ST_R16(FPU_DATA_FTOP, BX);
-				MUL(BX);
-				FSTP(MEMSD80(RCX, RAX, 0, CPU_CTX_R0));
-				gen_update_fpu_ptr<true>(instr);
+				MUL(MEMD16(RCX, FPU_DATA_FTOP));
+				FSTP(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // store src mem to dst st0. Guest fpu registers are always 80 bit wide, regardless of the size of the memory operand
+				RESTORE_FPU_CTX();
 			});
 
-		RESTORE_FPU_CTX();
-		MOV(EDX, EBX);
-		CALL_F(&fpu_update_tag<true>);
+		XOR(EDX, EDX);
+		CALL_F(&fpu_update_tag<true>); // update dst st0 tag
+		m_a.bind(end_instr);
 	}
 }
 
@@ -5533,6 +5602,8 @@ lc86_jit::fnstsw(decoded_instr *instr)
 void
 lc86_jit::fstp(decoded_instr *instr)
 {
+	LIB86CPU_ABORT();
+#if 0
 	if (m_cpu->cpu_ctx.hflags & (HFLG_CR0_EM | HFLG_CR0_TS)) {
 		RAISEin0_t(EXP_NM);
 	}
@@ -5542,10 +5613,13 @@ lc86_jit::fstp(decoded_instr *instr)
 			gen_fpu_stack_prologue<false>(fpu_instr_t::float_, [this]() {
 				MOV(EAX, sizeof(uint80_t));
 				MUL(MEMD16(RSP, LOCAL_VARS_off(4)));
-				FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // load guest st0 to host st0
+				FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // load src st0
 				});
 		}
 		else {
+			MOV(EAX, sizeof(uint80_t));
+			MUL(MEMD16(RCX, FPU_DATA_FTOP));
+			FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0)); // load src st0
 			XOR(R8D, R8D); // clear r8w so that gen_fpu_exp_post_check still works
 		}
 
@@ -5584,6 +5658,7 @@ lc86_jit::fstp(decoded_instr *instr)
 			CALL_F(&fpu_update_tag<false>);
 		}
 	}
+#endif
 }
 
 void
diff --git a/lib86cpu/core/emitter/x64/jit.h b/lib86cpu/core/emitter/x64/jit.h
index 22e319d..8cbe2c8 100644
--- a/lib86cpu/core/emitter/x64/jit.h
+++ b/lib86cpu/core/emitter/x64/jit.h
@@ -260,12 +260,10 @@ class lc86_jit : public Target {
 	template<unsigned num, unsigned store_at = 0, bool write_esp = true>
 	void gen_stack_pop();
 	void gen_simd_mem_align_check();
-	void gen_fpu_exp_post_check();
+	template<typename T>
+	void gen_fpu_exp_post_check(uint32_t exception, T &&unmasked);
 	void gen_set_host_fpu_ctx();
-	template<bool update_fdp>
-	void gen_update_fpu_ptr(decoded_instr *instr);
-	template<bool is_push>
-	void gen_fpu_stack_fault_check(fpu_instr_t fpu_instr);
+	void gen_update_fpu_ptr(decoded_instr *instr, x86::Gp mem_addr64 = x86::rbx);
 	template<unsigned idx>
 	void shift(decoded_instr *instr);
 	template<unsigned idx>
@@ -288,6 +286,8 @@ class lc86_jit : public Target {
 	void float_load_constant(decoded_instr *instr);
 	template<bool is_push, typename T>
 	void gen_fpu_stack_prologue(fpu_instr_t fpu_instr, T &&action_when_no_fault);
+	void gen_fpu_exp(uint32_t exception, stack_fault_func func);
+	void gen_check_fpu_unmasked_exp();
 
 	cpu_t *m_cpu;
 	CodeHolder m_code;
diff --git a/lib86cpu/core/fpu.cpp b/lib86cpu/core/fpu.cpp
index 230f363..a590228 100644
--- a/lib86cpu/core/fpu.cpp
+++ b/lib86cpu/core/fpu.cpp
@@ -20,9 +20,16 @@ fpu_init(cpu_t *cpu)
 	}
 }
 
+static void
+fpu_push(cpu_ctx_t *cpu_ctx)
+{
+	cpu_ctx->fpu_data.ftop = (cpu_ctx->fpu_data.ftop - 1) & 7; // decrement the top of stack index, wrapping around within [0, 7]
+}
+
 template<bool is_push>
-void fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx)
+void fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t st_num)
 {
+	uint32_t idx = (st_num + cpu_ctx->fpu_data.ftop) & 7;
 	if constexpr (is_push) {
 		uint16_t exp = cpu_ctx->regs.fr[idx].high & 0x7FFF;
 		uint64_t mant = cpu_ctx->regs.fr[idx].low;
@@ -43,77 +50,62 @@ void fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx)
 	}
 }
 
-template<bool is_push>
-uint32_t fpu_stack_check(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val, fpu_instr_t instr_type)
+uint32_t
+fpu_is_tag_empty(cpu_ctx_t *cpu_ctx, uint32_t st_num)
 {
-	// this function returns the fpu stack pointer to the value modified by the push/pop, and the flags of the status word following a stack fault.
-	// It also writes an appropriate indefinite value when it detects a masked stack exception
-	// NOTE: we only support masked stack exceptions for now
+	return cpu_ctx->regs.ftags[(st_num + cpu_ctx->fpu_data.ftop) & 7] == FPU_TAG_EMPTY; // returns 1 when the tag of st(st_num), relative to the current ftop, is empty and 0 otherwise
+}
 
-	uint32_t ftop, fstatus = cpu_ctx->regs.fstatus;
-	*sw = fstatus;
-	bool no_stack_fault;
-	if constexpr (is_push) {
-		// detect stack overflow
-		ftop = cpu_ctx->fpu_data.ftop;
-		ftop -= 1;
-		ftop &= 7;
-		no_stack_fault = cpu_ctx->regs.ftags[ftop] == FPU_TAG_EMPTY;
-	}
-	else {
-		// detect stack underflow
-		ftop = cpu_ctx->fpu_data.ftop;
-		no_stack_fault = cpu_ctx->regs.ftags[ftop] != FPU_TAG_EMPTY;
+void
+fpu_update_ptr(cpu_ctx_t *cpu_ctx, uint64_t instr_info)
+{
+	cpu_ctx->regs.fcs = cpu_ctx->regs.cs; // instr_info layout: bit 63 -> is_mem_op, [48 - 58] -> fop, [32 - 47] -> seg offset, [0 - 31] -> modrm addr
+	cpu_ctx->regs.fip = cpu_ctx->regs.eip; // NOTE(review): assumes regs.eip holds the address of the current fpu instr when this helper is called -- confirm
+	cpu_ctx->regs.fop = ((instr_info >> 48) & 0x7FF);
+	if (instr_info & (1ULL << 63)) { // fds and fdp are only updated for instr with a memory operand
+		cpu_ctx->regs.fds = *(uint16_t *)(((instr_info >> 32) & 0xFFFF) + (uint8_t *)cpu_ctx); // the seg offset field is a byte offset from the start of cpu_ctx, where a 16 bit value is read
+		cpu_ctx->regs.fdp = instr_info & 0xFFFFFFFF;
 	}
+}
 
-	if (!no_stack_fault) {
-		uint16_t fctrl = cpu_ctx->regs.fctrl;
-		fctrl &= FPU_EXP_INVALID;
-		if ((cpu_ctx->regs.fctrl & FPU_EXP_INVALID) == 0) {
-			static const char *abort_msg = "Unmasked fpu stack exception not supported";
-			cpu_runtime_abort(abort_msg); // won't return
-		}
-		// stack fault exception masked, write an indefinite value, so that the fpu instr uses it
-		fstatus |= (FPU_FLG_IE | FPU_FLG_SF | (is_push ? (1 << FPU_C1_SHIFT) : (0 << FPU_C1_SHIFT)));
-		*sw = fstatus;
-
-		switch (instr_type)
-		{
-		case fpu_instr_t::integer8:
-			inv_val->low = FPU_INTEGER_INDEFINITE8;
-			break;
-
-		case fpu_instr_t::integer16:
-			inv_val->low = FPU_INTEGER_INDEFINITE16;
-			break;
-
-		case fpu_instr_t::integer32:
-			inv_val->low = FPU_INTEGER_INDEFINITE32;
-			break;
-
-		case fpu_instr_t::integer64:
-			inv_val->low = FPU_INTEGER_INDEFINITE64;
-			break;
-
-		case fpu_instr_t::float_:
-			inv_val->low = FPU_QNAN_FLOAT_INDEFINITE64;
-			inv_val->high = FPU_QNAN_FLOAT_INDEFINITE16;
-			break;
+void
+fpu_stack_fault(cpu_ctx_t *cpu_ctx, uint32_t exception)
+{
+	assert(exception & FPU_EXP_INVALID); // callers are expected to always report the invalid operation flag
 
-		case fpu_instr_t::bcd:
-			inv_val->low = FPU_BCD_INDEFINITE64;
-			inv_val->high = FPU_BCD_INDEFINITE16;
-			break;
+	exception &= (FPU_EXP_ALL | FPU_FLG_SF | FPU_FLG_C1); // ignore any bit that is not an exception flag, SF or C1
+	uint32_t unmasked = (exception & ~cpu_ctx->regs.fctrl) & FPU_EXP_ALL; // reported exceptions whose mask bit is clear in fctrl
+	if (unmasked) {
+		cpu_ctx->regs.fstatus |= FPU_FLG_ES;
+	}
 
-		default:
-			LIB86CPU_ABORT();
+	cpu_ctx->regs.fstatus |= exception; // flags are only OR-ed in here, previously set flags are kept
+	if (exception & FPU_FLG_SF) {
+		if (!(exception & FPU_FLG_C1)) {
+			cpu_ctx->regs.fstatus &= ~FPU_FLG_C1; // stack fault reported without C1: make sure C1 is clear in fstatus
 		}
 	}
+}
+
+void
+fpu_stack_overflow(cpu_ctx_t *cpu_ctx, uint32_t exception)
+{
+	if (cpu_ctx->regs.fctrl & FPU_EXP_INVALID) { // the invalid operation exception is masked in fctrl
+		// masked stack fault response
+		fpu_push(cpu_ctx);
+		cpu_ctx->regs.fr[cpu_ctx->fpu_data.ftop].low = FPU_QNAN_FLOAT_INDEFINITE64; // write the float indefinite value to the new st0
+		cpu_ctx->regs.fr[cpu_ctx->fpu_data.ftop].high = FPU_QNAN_FLOAT_INDEFINITE16;
+		fpu_update_tag<true>(cpu_ctx, 0); // st_num 0 selects the new st0
+	}
 
-	return ftop;
+	fpu_stack_fault(cpu_ctx, exception); // update the fstatus flags in both the masked and the unmasked case
+}
+
+void
+fpu_stack_underflow(cpu_ctx_t *cpu_ctx, uint32_t exception)
+{
+	// TODO: not implemented yet, this helper is currently a no-op and ignores both arguments
 }
 
-template JIT_API uint32_t fpu_stack_check<true>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val, fpu_instr_t instr_type);
-template JIT_API uint32_t fpu_stack_check<false>(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val, fpu_instr_t instr_type);
 template JIT_API void fpu_update_tag<true>(cpu_ctx_t *cpu_ctx, uint32_t idx);
 template JIT_API void fpu_update_tag<false>(cpu_ctx_t *cpu_ctx, uint32_t idx);
diff --git a/lib86cpu/core/fpu.h b/lib86cpu/core/fpu.h
index beb8a78..9ebb35d 100644
--- a/lib86cpu/core/fpu.h
+++ b/lib86cpu/core/fpu.h
@@ -16,8 +16,14 @@ enum class fpu_instr_t : uint32_t {
 	bcd,
 };
 
+using stack_fault_func = void(* JIT_API)(cpu_ctx_t *, uint32_t); // signature shared by fpu_stack_overflow, fpu_stack_underflow and fpu_stack_fault
+
+
 void fpu_init(cpu_t *cpu);
 template<bool is_push>
-JIT_API void fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t idx);
-template<bool is_push>
-JIT_API uint32_t fpu_stack_check(cpu_ctx_t *cpu_ctx, uint32_t *sw, uint80_t *inv_val, fpu_instr_t instr_type);
+JIT_API void fpu_update_tag(cpu_ctx_t *cpu_ctx, uint32_t st_num); // st_num is relative to the current ftop
+JIT_API uint32_t fpu_is_tag_empty(cpu_ctx_t *cpu_ctx, uint32_t st_num); // returns non-zero when the tag of st(st_num) is empty
+JIT_API void fpu_stack_overflow(cpu_ctx_t *cpu_ctx, uint32_t exception); // when the invalid exception is masked, pushes a qnan; always updates fstatus
+JIT_API void fpu_stack_underflow(cpu_ctx_t *cpu_ctx, uint32_t exception); // currently a stub
+JIT_API void fpu_stack_fault(cpu_ctx_t *cpu_ctx, uint32_t exception); // merges exception in fstatus and sets ES when it is unmasked
+JIT_API void fpu_update_ptr(cpu_ctx_t *cpu_ctx, uint64_t instr_info); // updates fcs, fip, fop and, for memory operands, fds and fdp
diff --git a/lib86cpu/core/instructions.cpp b/lib86cpu/core/instructions.cpp
index db06dc9..b3f630e 100644
--- a/lib86cpu/core/instructions.cpp
+++ b/lib86cpu/core/instructions.cpp
@@ -819,6 +819,9 @@ uint32_t update_crN_helper(cpu_ctx_t *cpu_ctx, uint32_t new_cr, uint8_t idx)
 			cpu_ctx->hflags = (((new_cr & CR0_EM_MASK) << 3) | (cpu_ctx->hflags & ~HFLG_CR0_EM));
 			cpu_ctx->hflags = (((new_cr & CR0_MP_MASK) << 14) | (cpu_ctx->hflags & ~HFLG_CR0_MP));
 		}
+		if constexpr (idx1 == 0) {
+			cpu_ctx->hflags = (((new_cr & CR0_NE_MASK) << 2) | (cpu_ctx->hflags & ~HFLG_CR0_NE));
+		}
 		cpu_ctx->hflags = (((new_cr & CR0_TS_MASK) << 7) | (cpu_ctx->hflags & ~HFLG_CR0_TS));
 
 		if constexpr (idx1 != 2) {
diff --git a/lib86cpu/core/internal.h b/lib86cpu/core/internal.h
index a1827ba..cbab99e 100644
--- a/lib86cpu/core/internal.h
+++ b/lib86cpu/core/internal.h
@@ -43,6 +43,7 @@ JIT_API void tlb_invalidate_(cpu_ctx_t *cpu_ctx, addr_t addr);
 #define PE_MODE_SHIFT       4
 #define CR0_EM_SHIFT        5
 #define TRAMP_SHIFT         6
+#define CR0_NE_SHIFT        7
 #define CR4_OSFXSR_SHIFT    9
 #define CR0_TS_SHIFT        10
 #define CR0_MP_SHIFT        15
@@ -55,14 +56,15 @@ JIT_API void tlb_invalidate_(cpu_ctx_t *cpu_ctx, addr_t addr);
 #define HFLG_PE_MODE        (1 << PE_MODE_SHIFT)
 #define HFLG_CR0_EM         (1 << CR0_EM_SHIFT)
 #define HFLG_TRAMP          (1 << TRAMP_SHIFT)
+#define HFLG_CR0_NE         (1 << CR0_NE_SHIFT)
 #define HFLG_CR0_MP         (1 << CR0_MP_SHIFT)
 #define HFLG_CR0_TS         (1 << CR0_TS_SHIFT)
 #define HFLG_CR4_OSFXSR     (1 << CR4_OSFXSR_SHIFT)
 #define HFLG_CR4_VME        (1 << CR4_VME_SHIFT)
 #define HFLG_CR4_PVI        (1 << CR4_PVI_SHIFT)
-#define HFLG_CONST          (HFLG_CPL | HFLG_CS32 | HFLG_SS32 | HFLG_PE_MODE | HFLG_CR0_EM | HFLG_TRAMP | HFLG_CR0_MP | HFLG_CR0_TS \
+#define HFLG_CONST          (HFLG_CPL | HFLG_CS32 | HFLG_SS32 | HFLG_PE_MODE | HFLG_CR0_EM | HFLG_TRAMP | HFLG_CR0_MP | HFLG_CR0_TS | HFLG_CR0_NE \
 | HFLG_CR4_OSFXSR | HFLG_CR4_VME | HFLG_CR4_PVI)
-#define HFLG_SAVED_MASK     (HFLG_CPL | HFLG_CS32 | HFLG_SS32 | HFLG_PE_MODE | HFLG_CR0_EM | HFLG_CR0_MP | HFLG_CR0_TS | HFLG_CR4_OSFXSR | HFLG_CR4_VME | HFLG_CR4_PVI)
+#define HFLG_SAVED_MASK     (HFLG_CPL | HFLG_CS32 | HFLG_SS32 | HFLG_PE_MODE | HFLG_CR0_EM | HFLG_CR0_MP | HFLG_CR0_TS | HFLG_CR0_NE | HFLG_CR4_OSFXSR | HFLG_CR4_VME | HFLG_CR4_PVI)
 
 // cpu interrupt flags
 #define CPU_NO_INT           0
@@ -395,7 +397,7 @@ CR0_TS_MASK | CR0_EM_MASK | CR0_MP_MASK | CR0_PE_MASK)
 #define FPU_EXP_PRECISION  (1 << 5)
 #define FPU_EXP_ALL        (FPU_EXP_INVALID | FPU_EXP_DENORMAL | FPU_EXP_DIVBYZERO | FPU_EXP_OVERFLOW | FPU_EXP_UNDERFLOW | FPU_EXP_PRECISION)
 
-// fpu fstatus flags and shifts
+// fpu fstatus flags
 #define FPU_FLG_IE     FPU_EXP_INVALID
 #define FPU_FLG_DE     FPU_EXP_DENORMAL
 #define FPU_FLG_ZE     FPU_EXP_DIVBYZERO
@@ -404,14 +406,17 @@ CR0_TS_MASK | CR0_EM_MASK | CR0_MP_MASK | CR0_PE_MASK)
 #define FPU_FLG_PE     FPU_EXP_PRECISION
 #define FPU_FLG_SF     (1 << 6)
 #define FPU_FLG_ES     (1 << 7)
+#define FPU_FLG_C0     (1 << 8)
+#define FPU_FLG_C1     (1 << 9)
+#define FPU_FLG_C2     (1 << 10)
 #define FPU_FLG_TOP    (7 << 11)
+#define FPU_FLG_C3     (1 << 14)
 #define FPU_FLG_BSY    (1 << 15)
-#define FPU_ES_SHIFT   7
-#define FPU_C0_SHIFT   8
-#define FPU_C1_SHIFT   9
-#define FPU_C2_SHIFT   10
-#define FPU_TOP_SHIFT  11
-#define FPU_C3_SHIFT   14
+#define FPU_FLG_CC_ALL (FPU_FLG_C0 | FPU_FLG_C1 | FPU_FLG_C2 | FPU_FLG_C3) // all four condition code flags of fstatus
+
+// fpu stack fault flags
+#define FPU_STACK_OVERFLOW (FPU_EXP_INVALID | FPU_FLG_SF | FPU_FLG_C1) // invalid operation + stack fault, with C1 set
+#define FPU_STACK_UNDERFLOW (FPU_EXP_INVALID | FPU_FLG_SF) // invalid operation + stack fault, with C1 clear
 
 // fpu cctrl flags
 #define FPU_FLG_PC     (3 << 8)
diff --git a/lib86cpu/interface.cpp b/lib86cpu/interface.cpp
index 8ecbbf8..2cebb66 100644
--- a/lib86cpu/interface.cpp
+++ b/lib86cpu/interface.cpp
@@ -1338,7 +1338,7 @@ uint16_t
 read_fstatus(cpu_t *cpu)
 {
 	uint16_t fstatus = (cpu->cpu_ctx.regs.fstatus & ~FPU_FLG_TOP);
-	fstatus |= (cpu->cpu_ctx.fpu_data.ftop << FPU_TOP_SHIFT);
+	fstatus |= (cpu->cpu_ctx.fpu_data.ftop << 11);
 	return fstatus;
 }
 
@@ -1351,6 +1351,6 @@ read_fstatus(cpu_t *cpu)
 void
 write_fstatus(cpu_t *cpu, uint16_t value)
 {
-	cpu->cpu_ctx.fpu_data.ftop = (value & FPU_FLG_TOP) >> FPU_TOP_SHIFT;
+	cpu->cpu_ctx.fpu_data.ftop = (value & FPU_FLG_TOP) >> 11; // 11 is the bit position of the TOP field (FPU_FLG_TOP is 7 << 11)
 	cpu->cpu_ctx.regs.fstatus = value;
 }
diff --git a/lib86cpu/support.cpp b/lib86cpu/support.cpp
index 2676ebe..8c5c4c8 100644
--- a/lib86cpu/support.cpp
+++ b/lib86cpu/support.cpp
@@ -14,7 +14,7 @@
 #endif
 
 // This should be updated whenever cpu members that need to be saved are added/removed
-#define SAVE_STATE_ID 7
+#define SAVE_STATE_ID 8
 
 
 void