From 808a1e4375dfb820eb872a2892b158f7a56b854d Mon Sep 17 00:00:00 2001 From: ergo720 <45463469+ergo720@users.noreply.github.com> Date: Sun, 23 Jun 2024 02:18:56 +0200 Subject: [PATCH] Allocate a single pool at startup for all the jitted code that can be created, instead of allocating multiple pools at runtime on demand Testing with nxbx/nboxkrnl shows that this cuts in half the execution time --- lib86cpu/core/allocator.cpp | 102 +++++++++--------------- lib86cpu/core/allocator.h | 17 ++-- lib86cpu/core/emitter/x64/jit.cpp | 6 +- lib86cpu/core/linux/os_exceptions.cpp | 2 +- lib86cpu/core/linux/os_mem.cpp | 39 +-------- lib86cpu/core/linux/os_mem.h | 2 - lib86cpu/core/windows/os_exceptions.cpp | 2 +- lib86cpu/core/windows/os_mem.cpp | 40 +--------- lib86cpu/core/windows/os_mem.h | 2 - 9 files changed, 51 insertions(+), 161 deletions(-) diff --git a/lib86cpu/core/allocator.cpp b/lib86cpu/core/allocator.cpp index 7a03c10..c11d1ec 100644 --- a/lib86cpu/core/allocator.cpp +++ b/lib86cpu/core/allocator.cpp @@ -11,92 +11,74 @@ #include "os_exceptions.h" -mem_manager::block_header_t * -mem_manager::create_pool() +mem_manager::mem_manager() { - block_header_t *start = static_cast(os_alloc(POOL_SIZE)); - block_header_t *addr = start; - for (unsigned i = 0; i < BLOCKS_PER_POOL - 1; i++) { + m_code_block_area = os_alloc(CODE_CACHE_MAX_SIZE * BLOCK_SIZE + BLOCK_SIZE); // 32768 code blocks + another one for aux functions + init_pool(); +} + +void +mem_manager::init_pool() +{ + block_header_t *addr = static_cast(m_code_block_area); + m_head = addr; + for (unsigned i = 0; i < CODE_CACHE_MAX_SIZE - 1; ++i) { addr->next = reinterpret_cast(reinterpret_cast(addr) + BLOCK_SIZE); addr = addr->next; } - addr->next = nullptr; - blocks.emplace_back(start); - return start; } void * mem_manager::alloc() { - if (head == nullptr) { - head = create_pool(); - } - - block_header_t *addr = head; - head = head->next; + assert(m_head); + block_header_t *addr = m_head; + m_head = m_head->next; return addr; } void mem_manager::free(void *ptr) { - // this is necessary because we mark the code section memory as read-only after the code is written to it - os_protect(ptr, BLOCK_SIZE, get_mem_flags(MEM_READ | MEM_WRITE)); - static_cast(ptr)->next = head; - head = static_cast(ptr); + static_cast(ptr)->next = m_head; + m_head = static_cast(ptr); } void mem_manager::destroy_all_blocks() { #if defined(_WIN64) || defined(__linux__) - for (const auto &eh_pair : eh_frames) { + for (const auto &eh_pair : m_eh_frames) { os_delete_exp_info(eh_pair.second); } - eh_frames.clear(); + m_eh_frames.clear(); #endif #if defined(_WIN64) - for (auto &addr : blocks) { - os_free(addr); - } - - for (auto &block : big_blocks) { + for (auto &block : m_big_blocks) { os_free(block.first); } #elif defined(__linux__) - for (auto &addr : blocks) { - os_free(addr, POOL_SIZE); - } - - for (auto &block : big_blocks) { + for (auto &block : m_big_blocks) { os_free(block.first, block.second); } #endif - big_blocks.clear(); - blocks.clear(); - head = nullptr; + init_pool(); + m_big_blocks.clear(); } void mem_manager::purge_all_blocks() { destroy_all_blocks(); - #if defined(_WIN64) - for (auto &block : hidden_blocks) { - os_free(block.first); - } + os_free(m_code_block_area); #elif defined(__linux__) - for (auto &block : hidden_blocks) { - os_free(block.first, block.second); - } + os_free(m_code_block_area, CODE_CACHE_MAX_SIZE * BLOCK_SIZE + BLOCK_SIZE); #endif - - hidden_blocks.clear(); } mem_block @@ -110,7 +92,7 @@ mem_manager::allocate_sys_mem(size_t num_bytes) size_t block_size = (num_bytes + PAGE_MASK) & ~PAGE_MASK; void *addr = os_alloc(block_size); mem_block block(addr, block_size); - big_blocks.emplace(addr, block_size); + m_big_blocks.emplace(addr, block_size); return block; } @@ -118,21 +100,17 @@ mem_manager::allocate_sys_mem(size_t num_bytes) } mem_block -mem_manager::allocate_non_pooled_sys_mem(size_t num_bytes) +mem_manager::get_non_pooled_sys_mem(size_t num_bytes) { if (num_bytes == 0) { return mem_block(); } - size_t block_size = (num_bytes + PAGE_MASK) & ~PAGE_MASK; - void *addr = os_alloc(block_size); - mem_block block(addr, block_size); - hidden_blocks.emplace(addr, block_size); - return block; + return mem_block(reinterpret_cast(m_code_block_area) + CODE_CACHE_MAX_SIZE * BLOCK_SIZE, BLOCK_SIZE); } void -mem_manager::protect_sys_mem(const mem_block &block, unsigned flags) +mem_manager::flush_instr_cache(const mem_block &block) { void *addr = block.addr; size_t size = block.size; @@ -141,17 +119,13 @@ mem_manager::protect_sys_mem(const mem_block &block, unsigned flags) return; } - os_protect(addr, size, get_mem_flags(flags)); - - if (flags & MEM_EXEC) { #if defined(_WIN64) - os_flush_instr_cache(addr, size); + os_flush_instr_cache(addr, size); #elif defined(__linux__) - void *start = addr; - void *end = static_cast(addr) + size; - os_flush_instr_cache(start, end); + void *start = addr; + void *end = static_cast(addr) + size; + os_flush_instr_cache(start, end); #endif - } } void @@ -161,25 +135,21 @@ mem_manager::release_sys_mem(void *addr) return; } - if (auto it = hidden_blocks.find(addr); it != hidden_blocks.end()) { - return; - } - #if defined(_WIN64) || defined(__linux__) void *main_addr = reinterpret_cast(addr) + 16; - if (auto it = eh_frames.find(main_addr); it != eh_frames.end()) { + if (auto it = m_eh_frames.find(main_addr); it != m_eh_frames.end()) { os_delete_exp_info(it->second); - eh_frames.erase(main_addr); + m_eh_frames.erase(main_addr); } #endif - if (auto it = big_blocks.find(addr); it != big_blocks.end()) { + if (auto it = m_big_blocks.find(addr); it != m_big_blocks.end()) { #if defined(_WIN64) os_free(it->first); #elif defined(__linux__) os_free(it->first, it->second); #endif - big_blocks.erase(addr); + m_big_blocks.erase(addr); return; } diff --git a/lib86cpu/core/allocator.h b/lib86cpu/core/allocator.h index f85271f..b7fb34e 100644 --- a/lib86cpu/core/allocator.h +++ b/lib86cpu/core/allocator.h @@ -28,26 +28,25 @@ struct mem_block { class mem_manager { public: mem_block allocate_sys_mem(size_t num_bytes); - mem_block allocate_non_pooled_sys_mem(size_t num_bytes); - void protect_sys_mem(const mem_block &block, unsigned flags); + mem_block get_non_pooled_sys_mem(size_t num_bytes); + void flush_instr_cache(const mem_block &block); void release_sys_mem(void *addr); void destroy_all_blocks(); ~mem_manager() { purge_all_blocks(); } + mem_manager(); #if defined(_WIN64) || defined(__linux__) - std::map eh_frames; + std::map m_eh_frames; #endif private: struct block_header_t { block_header_t *next; }; - block_header_t *head = nullptr; - std::vector blocks; - std::map big_blocks; - std::map hidden_blocks; - - block_header_t *create_pool(); + block_header_t *m_head; + std::map m_big_blocks; + void *m_code_block_area; + void init_pool(); void *alloc(); void free(void *ptr); void purge_all_blocks(); diff --git a/lib86cpu/core/emitter/x64/jit.cpp b/lib86cpu/core/emitter/x64/jit.cpp index aadd2e9..4b258d9 100644 --- a/lib86cpu/core/emitter/x64/jit.cpp +++ b/lib86cpu/core/emitter/x64/jit.cpp @@ -542,7 +542,7 @@ lc86_jit::gen_code_block() #endif // This code block is complete, so protect and flush the instruction cache now - m_mem.protect_sys_mem(block, MEM_READ | MEM_EXEC); + m_mem.flush_instr_cache(block); tc->ptr_code = reinterpret_cast(main_offset); tc->jmp_offset[0] = tc->jmp_offset[1] = tc->jmp_offset[2] = reinterpret_cast(exit_offset); @@ -600,7 +600,7 @@ lc86_jit::gen_aux_funcs() throw lc86_exp_abort("The generated code has a zero size", lc86_status::internal_error); } - auto block = m_mem.allocate_non_pooled_sys_mem(estimated_code_size); + auto block = m_mem.get_non_pooled_sys_mem(estimated_code_size); if (auto err = m_code.relocateToBase(reinterpret_cast(block.addr))) { std::string err_str("Asmjit failed at relocateToBase() with the error "); err_str += DebugUtils::errorAsString(err); @@ -616,7 +616,7 @@ lc86_jit::gen_aux_funcs() assert(offset + buff_size <= estimated_code_size); std::memcpy(static_cast(block.addr) + offset, section->data(), buff_size); - m_mem.protect_sys_mem(block, MEM_READ | MEM_EXEC); + m_mem.flush_instr_cache(block); m_cpu->read_int_fn = reinterpret_cast(static_cast(block.addr) + offset); m_cpu->raise_int_fn = reinterpret_cast(static_cast(block.addr) + offset + raise_int_off_aligned16); diff --git a/lib86cpu/core/linux/os_exceptions.cpp b/lib86cpu/core/linux/os_exceptions.cpp index 91d797a..75d56c2 100644 --- a/lib86cpu/core/linux/os_exceptions.cpp +++ b/lib86cpu/core/linux/os_exceptions.cpp @@ -159,7 +159,7 @@ lc86_jit::gen_exception_info(uint8_t *code_ptr, size_t code_size) size_t aligned_code_size = (code_size + sizeof(void *) - 1) & ~(sizeof(void *) - 1); cie_t *cie = reinterpret_cast(code_ptr + aligned_code_size); write_eh_frame(cie, code_ptr, code_size); - m_mem.eh_frames.emplace(code_ptr, cie); + m_mem.m_eh_frames.emplace(code_ptr, cie); } void diff --git a/lib86cpu/core/linux/os_mem.cpp b/lib86cpu/core/linux/os_mem.cpp index 972193a..22079f8 100644 --- a/lib86cpu/core/linux/os_mem.cpp +++ b/lib86cpu/core/linux/os_mem.cpp @@ -10,40 +10,10 @@ #include "os_mem.h" -int -get_mem_flags(unsigned flags) -{ - switch (flags) - { - case MEM_READ: - return PROT_READ; - - case MEM_WRITE: - return PROT_WRITE; - - case MEM_READ | MEM_WRITE: - return PROT_READ | PROT_WRITE; - - case MEM_READ | MEM_EXEC: - return PROT_READ | PROT_EXEC; - - case MEM_READ | MEM_WRITE | MEM_EXEC: - return PROT_READ | PROT_WRITE | PROT_EXEC; - - case MEM_EXEC: - return PROT_READ | PROT_EXEC; - - default: - LIB86CPU_ABORT(); - } - - return PROT_NONE; -} - void * os_alloc(size_t size) { - auto addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + auto addr = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); if (addr == MAP_FAILED) { throw lc86_exp_abort("Failed to allocate memory for the generated code", lc86_status::no_memory); } @@ -57,13 +27,6 @@ os_free(void *addr, size_t size) assert(!ret); } -void -os_protect(void *addr, size_t size, int prot) -{ - [[maybe_unused]] auto ret = mprotect(addr, size, prot); - assert(!ret); -} - void os_flush_instr_cache(void *start, void *end) { diff --git a/lib86cpu/core/linux/os_mem.h b/lib86cpu/core/linux/os_mem.h index 87b063f..7e12550 100644 --- a/lib86cpu/core/linux/os_mem.h +++ b/lib86cpu/core/linux/os_mem.h @@ -7,8 +7,6 @@ #pragma once -int get_mem_flags(unsigned flags); void *os_alloc(size_t size); void os_free(void *addr, size_t size); -void os_protect(void *addr, size_t size, int prot); void os_flush_instr_cache(void *addr, void *end); diff --git a/lib86cpu/core/windows/os_exceptions.cpp b/lib86cpu/core/windows/os_exceptions.cpp index 6baa6bb..e2a1241 100644 --- a/lib86cpu/core/windows/os_exceptions.cpp +++ b/lib86cpu/core/windows/os_exceptions.cpp @@ -90,7 +90,7 @@ lc86_jit::gen_exception_info(uint8_t *code_ptr, size_t code_size) table->BeginAddress = 0; table->EndAddress = code_size; table->UnwindInfoAddress = aligned_code_size; - m_mem.eh_frames.emplace(code_ptr, table); + m_mem.m_eh_frames.emplace(code_ptr, table); [[maybe_unused]] auto ret = RtlAddFunctionTable(table, 1, reinterpret_cast(code_ptr)); assert(ret); diff --git a/lib86cpu/core/windows/os_mem.cpp b/lib86cpu/core/windows/os_mem.cpp index e8db4d3..b884ab5 100644 --- a/lib86cpu/core/windows/os_mem.cpp +++ b/lib86cpu/core/windows/os_mem.cpp @@ -10,40 +10,10 @@ #include "os_mem.h" -unsigned -get_mem_flags(unsigned flags) -{ - switch (flags) - { - case MEM_READ: - return PAGE_READONLY; - - case MEM_WRITE: - return PAGE_READWRITE; - - case MEM_READ | MEM_WRITE: - return PAGE_READWRITE; - - case MEM_READ | MEM_EXEC: - return PAGE_EXECUTE_READ; - - case MEM_READ | MEM_WRITE | MEM_EXEC: - return PAGE_EXECUTE_READWRITE; - - case MEM_EXEC: - return PAGE_EXECUTE; - - default: - LIB86CPU_ABORT(); - } - - return PAGE_NOACCESS; -} - void * os_alloc(size_t size) { - auto addr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + auto addr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE); if (addr == NULL) { throw lc86_exp_abort("Failed to allocate memory for the generated code", lc86_status::no_memory); } @@ -57,14 +27,6 @@ os_free(void *addr) assert(ret); } -void -os_protect(void *addr, size_t size, unsigned prot) -{ - DWORD dummy; - [[maybe_unused]] auto ret = VirtualProtect(addr, size, prot, &dummy); - assert(ret); -} - void os_flush_instr_cache(void *addr, size_t size) { diff --git a/lib86cpu/core/windows/os_mem.h b/lib86cpu/core/windows/os_mem.h index b1ab66d..61791e8 100644 --- a/lib86cpu/core/windows/os_mem.h +++ b/lib86cpu/core/windows/os_mem.h @@ -7,8 +7,6 @@ #pragma once -unsigned get_mem_flags(unsigned flags); void *os_alloc(size_t size); void os_free(void *addr); -void os_protect(void *addr, size_t size, unsigned prot); void os_flush_instr_cache(void *addr, size_t size);