From 94f904857473d9256a78f9aaacede18e487330e4 Mon Sep 17 00:00:00 2001 From: sewer56 Date: Thu, 21 Dec 2023 20:19:57 +0000 Subject: [PATCH] Added: 128-bit atomics & completed assembly hook/unhook code. --- docs/dev/design/assembly-hooks/overview.md | 7 +- docs/dev/design/branch-hooks/overview.md | 93 +++++++- projects/reloaded-hooks-portable/Cargo.toml | 1 + .../src/api/buffers/buffer_abstractions.rs | 4 +- .../src/api/hooks/assembly/assembly_hook.rs | 11 +- .../src/helpers/atomic_write.rs | 8 +- .../src/helpers/atomic_write_masked.rs | 207 ++++++++++++++++-- .../src/helpers/overwrite_code.rs | 4 +- 8 files changed, 303 insertions(+), 32 deletions(-) diff --git a/docs/dev/design/assembly-hooks/overview.md b/docs/dev/design/assembly-hooks/overview.md index 0e4c2cf..96b49dd 100644 --- a/docs/dev/design/assembly-hooks/overview.md +++ b/docs/dev/design/assembly-hooks/overview.md @@ -136,10 +136,6 @@ The following table below shows common hook lengths, for: ## Thread Safety & Memory Layout -!!! note "[Reloaded3](https://reloaded-project.github.io/Reloaded-III/) allows mod load/unloads in real time, so this is a hard requirement." - -!!! warning "Therefore, assembly hooks should be thread safe." - In order to support thread safety, while retaining maximum runtime performance, the buffers where the original and hook code are contained have a very specific memory layout (shown below) @@ -149,6 +145,9 @@ original and hook code are contained have a very specific memory layout (shown b - Original Code ``` +Emplacing the jump to the hook function itself, and patching within the hook function should be atomic +whenever it is possible on the platform. + ### Example If the *'Original Code'* was: diff --git a/docs/dev/design/branch-hooks/overview.md b/docs/dev/design/branch-hooks/overview.md index 51d7cd0..76b34a9 100644 --- a/docs/dev/design/branch-hooks/overview.md +++ b/docs/dev/design/branch-hooks/overview.md @@ -10,6 +10,11 @@ !!! warning "Only guaranteed to work on platforms with [Targeted Memory Allocation](../../platform/overview.md#recommended-targeted-memory-allocation)" + Because the library needs to be able to acquire memory in proximity of the original function. + + Usually this is almost always achievable, but cases where Denuvo DRM inflates ARM64 binaries + (20MB -> 500MB) may prove problematic as ARM64 has +-128MiB range for relative jumps. + !!! note "I'm not a security person/researcher. I just make full stack game modding tools, mods and libraries. Naming in these design docs might be unconventional." This hook works by replacing the target of a `call` (a.k.a. Branch with Link) instruction with a new target. @@ -85,4 +90,90 @@ flowchart TD When the hook is deactivated, the stub is replaced with a direct jump back to the original function. By bypassing your code entirely, it is safe for your dynamic library (`.dll`/`.so`/`.dylib`) -to unload from the process. \ No newline at end of file +to unload from the process. + +## Example + +### Before + +```asm +; x86 Assembly +originalCaller: + ; Some code... + call originalFunction + ; More code... + +originalFunction: + ; Function implementation... +``` + +### After (Fast Mode) + +```asm +; x86 Assembly +originalCaller: + ; Some code... + call newFunction + ; More code... + +newFunction: + ; New function implementation... + call originalFunction ; Optional. + +originalFunction: + ; Original function implementation... +``` + +### After + +```asm +; x86 Assembly +originalCaller: + ; Some code... + call stub + ; More code... 
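+    ; (the original call to originalFunction now targets the stub)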
+ +stub: + jmp newFunction + ; nop padding to 8 bytes (if needed) + +newFunction: + ; New function implementation... + call originalFunction ; Optional. + +originalFunction: + ; Original function implementation... +``` + +### After (with Calling Convention Conversion) + +```asm +; x86 Assembly +originalCaller: + ; Some code... + call wrapper + ; More code... + +wrapper: + ; call convention conversion implementation + call newFunction + ; call convention conversion implementation + ret + +newFunction: + ; New function implementation... + call reverseWrapper ; Optional. + +reverseWrapper: + ; call convention conversion implementation + call originalFunction + ; call convention conversion implementation + ret + +originalFunction: + ; Original function implementation... +``` + +## Thread Safety & Memory Layout + +Emplacing the jump to the stub and patching within the stub are atomic operations on all supported platforms. \ No newline at end of file diff --git a/projects/reloaded-hooks-portable/Cargo.toml b/projects/reloaded-hooks-portable/Cargo.toml index ec0c143..e8583d7 100644 --- a/projects/reloaded-hooks-portable/Cargo.toml +++ b/projects/reloaded-hooks-portable/Cargo.toml @@ -21,6 +21,7 @@ bitflags = "2.4.1" derive_more = { version = "0.99.17", default-features = false, features = ["from", "add", "iterator"] } derive-new = { version = "0.6.0", default-features = false } bitfield = "0.14.0" +portable-atomic = "1.6.0" # Tests only! lazy_static = { version = "1.4.0", features = ["spin_no_std"] } diff --git a/projects/reloaded-hooks-portable/src/api/buffers/buffer_abstractions.rs b/projects/reloaded-hooks-portable/src/api/buffers/buffer_abstractions.rs index ee18e49..454b5fa 100644 --- a/projects/reloaded-hooks-portable/src/api/buffers/buffer_abstractions.rs +++ b/projects/reloaded-hooks-portable/src/api/buffers/buffer_abstractions.rs @@ -106,8 +106,8 @@ pub trait Buffer { /// This method works around the complicated tidbits of writing to buffer, such as instruction /// cache invalidation and permission changes on W^X systems where applicable. /// - /// `TInteger` must be a native integer type, such as `u8`, `u16`, `u32`, `u64`, which can be written - /// using a single instruction. + /// `TInteger` must be a native integer type, such as `u8`, `u16`, `u32`, `u64`, `u128` which + /// can be written using a single instruction. /// /// # Parameters /// diff --git a/projects/reloaded-hooks-portable/src/api/hooks/assembly/assembly_hook.rs b/projects/reloaded-hooks-portable/src/api/hooks/assembly/assembly_hook.rs index 48c216a..3472250 100644 --- a/projects/reloaded-hooks-portable/src/api/hooks/assembly/assembly_hook.rs +++ b/projects/reloaded-hooks-portable/src/api/hooks/assembly/assembly_hook.rs @@ -196,13 +196,18 @@ where /// Writes the hook to memory, either enabling or disabling it based on the provided parameters. fn write_hook(&self, branch_opcode: &[u8], code: &[u8], num_bytes: usize) { // Write the branch first, as per docs - TBuffer::overwrite(self.stub_address, branch_opcode); + // This also overwrites some extra code afterwards, but that's a-ok for now. 
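+        // Writing the branch first means a thread that enters the stub while the remaining
+        // bytes are being rewritten below is diverted before it can reach a partially
+        // written instruction.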
+ unsafe { + atomic_write_masked::(self.stub_address, branch_opcode, num_bytes); + } // Now write the remaining code TBuffer::overwrite(self.stub_address + num_bytes, &code[num_bytes..]); - // Now write the non-branch code - atomic_write_masked::(self.stub_address, &code[..num_bytes], num_bytes); + // And now re-insert the code we temp overwrote with the branch + unsafe { + atomic_write_masked::(self.stub_address, code, num_bytes); + } } /// Enables the hook. diff --git a/projects/reloaded-hooks-portable/src/helpers/atomic_write.rs b/projects/reloaded-hooks-portable/src/helpers/atomic_write.rs index 3caf474..7de470a 100644 --- a/projects/reloaded-hooks-portable/src/helpers/atomic_write.rs +++ b/projects/reloaded-hooks-portable/src/helpers/atomic_write.rs @@ -1,7 +1,9 @@ use core::sync::atomic::{AtomicU16, AtomicU32, AtomicU64, AtomicU8, Ordering}; +use portable_atomic::AtomicU128; + /// Performs an atomic write of value in `src` to `tgt`. -/// Size must be 1/2/4/8 bytes. +/// Size must be 1/2/4/8/16 bytes. /// /// # Safety /// @@ -25,6 +27,10 @@ pub unsafe fn atomic_write(src: *const u8, tgt: *mut u8, size: usize) { let atomic = (tgt as *mut AtomicU64).as_ref().unwrap_unchecked(); atomic.store(*(src as *const u64), Ordering::Relaxed); } + 16 => { + let atomic = (tgt as *mut AtomicU128).as_ref().unwrap_unchecked(); + atomic.store(*(src as *const u128), Ordering::Relaxed); + } _ => panic!("Unsupported size for atomic write."), } } diff --git a/projects/reloaded-hooks-portable/src/helpers/atomic_write_masked.rs b/projects/reloaded-hooks-portable/src/helpers/atomic_write_masked.rs index 6642742..be9235b 100644 --- a/projects/reloaded-hooks-portable/src/helpers/atomic_write_masked.rs +++ b/projects/reloaded-hooks-portable/src/helpers/atomic_write_masked.rs @@ -1,5 +1,5 @@ use crate::api::buffers::buffer_abstractions::Buffer; -use core::ptr::read_unaligned; +use core::{hint::unreachable_unchecked, ptr::read_unaligned}; pub trait AtomicWriter { /// Writes a native integer type to a given address atomically. @@ -39,10 +39,17 @@ where } } +pub const MAX_ATOMIC_WRITE_BYTES: u8 = 16; + /// Overwrites bytes at the specified address with provided bytes using atomic operations. /// This will only replace the specified number of bytes, preserving the rest. +/// +/// # Safety +/// +/// Readable memory at 'address' must be at least 'num_bytes' rounded up to next power of 2 long. +/// I.e. This may not work if at end of virtual address space. 
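+///
+/// # Example
+///
+/// A minimal sketch, assuming `NativeMemoryAtomicWriter` from this module and an `address`
+/// the caller is allowed to patch (the 5-byte x86 `jmp rel32` below is illustrative only):
+///
+/// ```ignore
+/// // Replace the first 5 bytes at `address`, preserving the rest of the
+/// // enclosing 8-byte unit.
+/// let jmp = [0xE9u8, 0x00, 0x10, 0x00, 0x00];
+/// unsafe { atomic_write_masked::<NativeMemoryAtomicWriter>(address, &jmp, jmp.len()) };
+/// ```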
#[inline] -pub fn atomic_write_masked(address: usize, code: &[u8], num_bytes: usize) +pub unsafe fn atomic_write_masked(address: usize, code: &[u8], num_bytes: usize) where TWriter: AtomicWriter, { @@ -56,11 +63,11 @@ where } 3..=4 => { let existing_code = read_unaligned(address as *const u32); - let code = read_unaligned(code.as_ptr() as *const u32); + let code = read_bytes_as_u32(code.as_ptr(), num_bytes); let mask = match num_bytes { 3 => 0x00_FF_FF_FF_u32.to_le(), 4 => 0xFF_FF_FF_FF, - _ => unreachable!(), + _ => unreachable_unchecked(), }; let combined_code = (existing_code & !mask) | (code & mask); @@ -68,34 +75,127 @@ where } 5..=8 => { let existing_code = read_unaligned(address as *const u64); - let mut temp_code: u64 = 0; + let code: u64 = read_bytes_as_u64(code.as_ptr(), num_bytes); let mask: u64 = match num_bytes { 5 => 0x00_00_00_FF_FF_FF_FF_FF_u64.to_le(), 6 => 0x00_00_FF_FF_FF_FF_FF_FF_u64.to_le(), 7 => 0x00_FF_FF_FF_FF_FF_FF_FF_u64.to_le(), 8 => 0xFF_FF_FF_FF_FF_FF_FF_FF_u64, - _ => unreachable!(), + _ => unreachable_unchecked(), }; - if cfg!(target_endian = "little") { - for (i, &byte) in code.iter().enumerate() { - temp_code |= (byte as u64) << (i * 8); - } - } else { - // Big-endian case - for (i, &byte) in code.iter().enumerate() { - temp_code |= (byte as u64) << ((7 - i) * 8); - } - } + let combined_code = (existing_code & !mask) | (code & mask); + TWriter::atomic_write(address, combined_code); + } + 9..=16 => { + let existing_code = read_unaligned(address as *const u128); + let code: u128 = read_bytes_as_u128(code.as_ptr(), num_bytes); + let mask: u128 = match num_bytes { + 9 => 0x00_00_00_00_00_00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 10 => 0x00_00_00_00_00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 11 => 0x00_00_00_00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 12 => 0x00_00_00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 13 => 0x00_00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 14 => 0x00_00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 15 => 0x00_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128.to_le(), + 16 => 0xFF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_FF_u128, + _ => unreachable!(), + }; - let combined_code = (existing_code & !mask) | (temp_code & mask); + let combined_code = (existing_code & !mask) | (code & mask); TWriter::atomic_write(address, combined_code); } + _ => panic!("Unsupported num_bytes in atomic_overwrite_with_mask"), } } } +unsafe fn read_bytes_as_u32(address: *const u8, num_bytes: usize) -> u32 { + if num_bytes == 4 { + read_unaligned(address as *const u32) + } else { + // num_bytes is 3. + if cfg!(target_endian = "little") { + // Little-endian: LSB is at the lowest address. + let lower_bytes = read_unaligned(address as *const u16) as u32; + let upper_byte = *address.add(2) as u32; + lower_bytes | (upper_byte << 16) // Leftmost byte should be empty. + } else { + // Big-endian: MSB is at the lowest address. 
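+            // Pack the three payload bytes into the most significant byte positions so the
+            // result lines up with the caller's 3-byte mask (`0x00_FF_FF_FF_u32.to_le()`
+            // byte-swaps to 0xFF_FF_FF_00 here), leaving the low byte untouched.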
+ let lower_bytes = read_unaligned(address as *const u16) as u32; + let upper_byte = *address.add(2) as u32; + lower_bytes << 16 | (upper_byte << 8) // We shift 8 because the rightmost byte should be empty + } + } +} + +unsafe fn read_bytes_as_u64(address: *const u8, num_bytes: usize) -> u64 { + if num_bytes == 8 { + read_unaligned(address as *const u64) + } else { + // num_bytes is between 5 and 7 + let mut value: u64 = read_unaligned(address as *const u32) as u64; + if cfg!(target_endian = "little") { + // Little-endian + // Byte 5 + value |= (*address.add(4) as u64) << 32; + + if num_bytes == 7 { + // Bytes 6-7 + value |= (read_unaligned(address.add(5) as *const u16) as u64) << 40; + } else if num_bytes == 6 { + // Byte 6 + value |= (*address.add(5) as u64) << 40; + } + } else { + // Big-endian + // Bytes 0-4 + value <<= 32; + + // Byte 5 + value |= (*address.add(4) as u64) << 24; + + if num_bytes == 7 { + // Bytes 6-7 + value |= (read_unaligned(address.add(5) as *const u16) as u64) << 8; + } else if num_bytes == 6 { + // Byte 6 + value |= (*address.add(5) as u64) << 16; + } + } + + value + } +} + +unsafe fn read_bytes_as_u128(address: *const u8, num_bytes: usize) -> u128 { + if num_bytes == 16 { + return read_unaligned(address as *const u128); + } + + if cfg!(target_endian = "little") { + // Read first 8 bytes + let mut value = read_unaligned(address as *const u64) as u128; + + // Bytes >= 8 + for i in 8..num_bytes { + value |= (*address.add(i) as u128) << (8 * i); + } + value + } else { + // Read first 8 bytes + let mut value = (read_unaligned(address as *const u64) as u128) << 64; + + // Bytes >= 8 + for i in 8..num_bytes { + value |= (*address.add(i) as u128) << ((15 - i) * 8); + } + + value + } +} + #[cfg(test)] mod tests { use super::*; @@ -104,10 +204,12 @@ mod tests { ($name:ident, $num_bytes:expr, $input:expr, $expected:expr) => { #[test] fn $name() { - let mut buffer = [0xFFu8; 8]; + let mut buffer = [0xFFu8; 16]; let address = buffer.as_mut_ptr() as usize; - atomic_write_masked::(address, &$input, $num_bytes); + unsafe { + atomic_write_masked::(address, &$input, $num_bytes); + } let result = &buffer[0..$num_bytes]; assert_eq!(result, $expected); @@ -162,4 +264,71 @@ mod tests { [0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21], &[0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21] ); + atomic_write_test!( + test_atomic_write_masked_9_bytes, + 9, + [0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A], + &[0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A] + ); + + atomic_write_test!( + test_atomic_write_masked_10_bytes, + 10, + [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39], + &[0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39] + ); + + atomic_write_test!( + test_atomic_write_masked_11_bytes, + 11, + [0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44], + &[0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44] + ); + + atomic_write_test!( + test_atomic_write_masked_12_bytes, + 12, + [0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50], + &[0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50] + ); + + atomic_write_test!( + test_atomic_write_masked_13_bytes, + 13, + [0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D], + &[0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D] + ); + + atomic_write_test!( + test_atomic_write_masked_14_bytes, + 14, + [0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B], + &[0x5E, 
0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B] + ); + + atomic_write_test!( + test_atomic_write_masked_15_bytes, + 15, + [ + 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A + ], + &[ + 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A + ] + ); + + atomic_write_test!( + test_atomic_write_masked_16_bytes, + 16, + [ + 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, + 0x89, 0x8A + ], + &[ + 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, + 0x89, 0x8A + ] + ); } diff --git a/projects/reloaded-hooks-portable/src/helpers/overwrite_code.rs b/projects/reloaded-hooks-portable/src/helpers/overwrite_code.rs index dd7b82e..123f96a 100644 --- a/projects/reloaded-hooks-portable/src/helpers/overwrite_code.rs +++ b/projects/reloaded-hooks-portable/src/helpers/overwrite_code.rs @@ -1,7 +1,7 @@ use core::ptr::copy_nonoverlapping; use super::{ - atomic_write_masked::{atomic_write_masked, NativeMemoryAtomicWriter}, + atomic_write_masked::{atomic_write_masked, NativeMemoryAtomicWriter, MAX_ATOMIC_WRITE_BYTES}, icache_clear::clear_instruction_cache, }; use crate::api::platforms::platform_functions::{ @@ -27,7 +27,7 @@ pub(crate) fn overwrite_code(address: usize, buffer: &[u8]) { unsafe { // If the instructions are short, we can do it atomic! >w< enhancing our reliability. - if buffer.len() <= 8 { + if buffer.len() <= MAX_ATOMIC_WRITE_BYTES as usize { atomic_write_masked::(address, buffer, buffer.len()); } else { copy_nonoverlapping(buffer.as_ptr(), address as *mut u8, buffer.len());
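+            // Note: this fallback is a plain, non-atomic copy; only writes of up to
+            // MAX_ATOMIC_WRITE_BYTES bytes can take the atomic path above.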