diff --git a/include/scanners/x64/AVX.h b/include/scanners/x64/AVX.h new file mode 100644 index 0000000..1d5ea9c --- /dev/null +++ b/include/scanners/x64/AVX.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include + +#include "../../util.h" + +namespace Pattern16 { + namespace Impl { + PATTERN16_NO_INLINE const void* scanRegion(const void* regionStart, const void* regionEnd, size_t sigStart, int, SplitSignature<__m256i>& signature, size_t length) { + auto sig_bytes = _mm256_load_si256(signature.first.data()); + auto mask_bytes = _mm256_load_si256(signature.second.data()); + auto psig_bytes = (reinterpret_cast(signature.first.data())); + auto sig_bytemask = broadcastMask256(psig_bytes[0 + sigStart], psig_bytes[1 + sigStart]); + auto sig_offset = -(intptr_t)sigStart; + auto blendmask = _mm256_set1_epi16(0x8000u); + auto cur = reinterpret_cast(regionStart) - 2; + auto safety = (signature.first.size() + 1) * sizeof(__m256i); + auto span = (((uintptr_t)regionEnd - (uintptr_t)regionStart - safety) >> 6); + { + outer_loop_continue1: + outer_loop_continue2: + cur += 2; + if PATTERN16_LIKELY (--span) { + _mm_prefetch(reinterpret_cast(cur + 16), _MM_HINT_T0); + _mm_prefetch(reinterpret_cast(cur + 48), _MM_HINT_T0); + uint64_t result; + uint32_t resultl; + uint32_t resulth; + { + auto read_aligned1 = _mm256_stream_load_si256(cur); + auto read_unaligned1 = _mm256_lddqu_si256(reinterpret_cast(reinterpret_cast(cur) + 1)); + auto check_aligned1 = _mm256_cmpeq_epi16(sig_bytemask, read_aligned1); + auto check_unaligned1 = _mm256_cmpeq_epi16(sig_bytemask, read_unaligned1); + resultl = _mm256_movemask_epi8(_mm256_blendv_epi8(check_aligned1, check_unaligned1, blendmask)); + auto read_aligned2 = _mm256_stream_load_si256(cur + 1); + auto read_unaligned2 = _mm256_lddqu_si256(reinterpret_cast(reinterpret_cast(cur) + 1) + 1); + auto check_aligned2 = _mm256_cmpeq_epi16(sig_bytemask, read_aligned2); + auto check_unaligned2 = _mm256_cmpeq_epi16(sig_bytemask, read_unaligned2); + resulth = _mm256_movemask_epi8(_mm256_blendv_epi8(check_aligned2, check_unaligned2, blendmask)); + if PATTERN16_LIKELY (!(resulth | resultl)) goto outer_loop_continue1; + } + { + result = resultl; + result += static_cast(resulth) << 32; + auto cur_sig = reinterpret_cast(cur) + sig_offset; + uint64_t result_ = result; + inner_loop_continue1: + inner_loop_continue2: + if PATTERN16_UNLIKELY (!(result &= result_)) goto outer_loop_continue2; + auto cur_sig_start = reinterpret_cast(cur_sig + _tzcnt_u64(result)); + result_ = result--; + auto potential_match = _mm256_lddqu_si256(cur_sig_start); + potential_match = _mm256_xor_si256(potential_match, sig_bytes); + if PATTERN16_LIKELY (!_mm256_testz_si256(potential_match, mask_bytes)) goto inner_loop_continue1; + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = _mm256_lddqu_si256(cur_sig_start + length_); + potential_match = _mm256_xor_si256(potential_match, signature.first[length_]); + if (!_mm256_testz_si256(potential_match, signature.second[length_])) break; + } + goto inner_loop_continue2; + } + } + } + auto cur_byte = reinterpret_cast(cur); + auto end_byte = reinterpret_cast(cur + 3); + do { + auto cur_sig_start = reinterpret_cast(cur_byte); + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = _mm256_lddqu_si256(cur_sig_start + length_); + potential_match = _mm256_xor_si256(potential_match, signature.first[length_]); + if (!_mm256_testz_si256(potential_match, signature.second[length_])) break; + } + } while (++cur_byte < end_byte); + return nullptr; + } + } +} \ No newline at end of file diff --git a/include/scanners/x64/SSE.h b/include/scanners/x64/SSE.h new file mode 100644 index 0000000..d1a6686 --- /dev/null +++ b/include/scanners/x64/SSE.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include +#include + +#include "../../util.h" + +namespace Pattern16 { + namespace Impl { + template + PATTERN16_NO_INLINE const void* scanRegion(const void* regionStart, const void* regionEnd, size_t sigStart, int, SplitSignature<__m128i>& signature, size_t length) { + auto sig_bytes = _mm_load_si128(signature.first.data()); + auto mask_bytes = _mm_load_si128(signature.second.data()); + auto psig_bytes = (reinterpret_cast(signature.first.data())); + auto sig_bytemask = broadcastMask128(psig_bytes[0 + sigStart], psig_bytes[1 + sigStart]); + auto sig_offset = -(intptr_t)sigStart; + auto checkmask_unaligned = broadcastMask128(0, 0xFF); + auto cur = reinterpret_cast(reinterpret_cast(regionStart)) - 4; + auto safety = (signature.first.size() + 3) * sizeof(__m128i); + auto span = (((uintptr_t)regionEnd - (uintptr_t)regionStart - safety) >> 6); + { + outer_loop_continue1: + outer_loop_continue2: + cur += 4; + if PATTERN16_LIKELY (--span) { + _mm_prefetch(reinterpret_cast(cur + 32), _MM_HINT_T0); + _mm_prefetch(reinterpret_cast(cur + 96), _MM_HINT_T0); + uint64_t result; + { + auto read_aligned1 = _mm_load_si128(cur + 2); + auto read_unaligned1 = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(cur) + 1) + 2); + auto read_aligned2 = _mm_load_si128(cur + 3); + auto read_unaligned2 = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(cur) + 1) + 3); + auto check_aligned2 = _mm_cmpeq_epi16(read_aligned2, sig_bytemask); + auto check_unaligned2 = _mm_cmpeq_epi16(read_unaligned2, sig_bytemask); + check_unaligned2 = _mm_and_si128(check_unaligned2, checkmask_unaligned); + result = _mm_movemask_epi8(_mm_or_si128(check_aligned2, check_unaligned2)); + result <<= 16; + auto check_aligned1 = _mm_cmpeq_epi16(read_aligned1, sig_bytemask); + auto check_unaligned1 = _mm_cmpeq_epi16(read_unaligned1, sig_bytemask); + check_unaligned1 = _mm_and_si128(check_unaligned1, checkmask_unaligned); + result |= _mm_movemask_epi8(_mm_or_si128(check_aligned1, check_unaligned1)); + result <<= 16; + } + { + auto read_aligned1 = _mm_load_si128(cur); + auto read_unaligned1 = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(cur) + 1)); + auto read_aligned2 = _mm_load_si128(cur + 1); + auto read_unaligned2 = _mm_loadu_si128(reinterpret_cast(reinterpret_cast(cur) + 1) + 1); + auto check_aligned2 = _mm_cmpeq_epi16(read_aligned2, sig_bytemask); + auto check_unaligned2 = _mm_cmpeq_epi16(read_unaligned2, sig_bytemask); + check_unaligned2 = _mm_and_si128(check_unaligned2, checkmask_unaligned); + result |= _mm_movemask_epi8(_mm_or_si128(check_aligned2, check_unaligned2)); + result <<= 16; + auto check_aligned1 = _mm_cmpeq_epi16(read_aligned1, sig_bytemask); + auto check_unaligned1 = _mm_cmpeq_epi16(read_unaligned1, sig_bytemask); + check_unaligned1 = _mm_and_si128(check_unaligned1, checkmask_unaligned); + if PATTERN16_LIKELY (!(result |= _mm_movemask_epi8(_mm_or_si128(check_aligned1, check_unaligned1)))) goto outer_loop_continue1; + } + { + auto cur_sig = reinterpret_cast(cur) + sig_offset; + uint64_t result_ = result; + inner_loop_continue1: + inner_loop_continue2: + if PATTERN16_UNLIKELY (!(result &= result_)) goto outer_loop_continue2; + auto cur_sig_start = reinterpret_cast(cur_sig + _tzcnt_u64(result)); + result_ = result--; + auto potential_match = _mm_loadu_si128(cur_sig_start); + potential_match = _mm_xor_si128(potential_match, sig_bytes); + if PATTERN16_LIKELY (!_mm_testz_SSE(potential_match, mask_bytes)) goto inner_loop_continue1; + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = _mm_loadu_si128(cur_sig_start + length_); + potential_match = _mm_xor_si128(potential_match, signature.first[length_]); + if (!_mm_testz_SSE(potential_match, signature.second[length_])) break; + } + goto inner_loop_continue2; + } + } + } + auto cur_byte = reinterpret_cast(cur); + auto end_byte = reinterpret_cast(cur + 7); + do { + auto cur_sig_start = reinterpret_cast(cur_byte); + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = _mm_loadu_si128(cur_sig_start + length_); + potential_match = _mm_xor_si128(potential_match, signature.first[length_]); + if (!_mm_testz_si128(potential_match, signature.second[length_])) break; + } + } while (++cur_byte < end_byte); + return nullptr; + } + } +} \ No newline at end of file diff --git a/include/scanners/x64/x64.h b/include/scanners/x64/x64.h new file mode 100644 index 0000000..cdb6237 --- /dev/null +++ b/include/scanners/x64/x64.h @@ -0,0 +1,264 @@ +#pragma once + +#include +#include +#include + +#include "../../util.h" + +namespace Pattern16 { + namespace Impl { + PATTERN16_NO_INLINE const void* scanRegion(const void* regionStart, const void* regionEnd, size_t sigStart, int, SplitSignature& signature, size_t length) { + auto psig = signature.first.data(); + auto pmask = signature.first.data(); + auto psig_bytes = (reinterpret_cast(signature.first.data())); + auto pmask_bytes = (reinterpret_cast(signature.second.data())); + auto sig_bytemask = broadcastMask64(psig_bytes[0 + sigStart], psig_bytes[1 + sigStart]); + auto sig_offset = -static_cast(sigStart); + auto mask_bytemask1 = 0xFFFF'0000'0000ull; + auto mask_bytemask2 = 0xFFFF'0000'0000'0000ull; + auto cur = reinterpret_cast(regionStart) - 8; + auto safety = (signature.first.size() + 5) * sizeof(uint64_t); + auto span = ((reinterpret_cast(regionEnd) - reinterpret_cast(regionStart) - safety) >> 6); + { + outer_loop_continue1: + outer_loop_continue2: + cur += 8; + if PATTERN16_LIKELY (--span) { + _mm_prefetch(reinterpret_cast(cur + 64), _MM_HINT_T0); + _mm_prefetch(reinterpret_cast(cur + 192), _MM_HINT_T0); + uint32_t resultl = 0; + uint64_t resulth = 0; + { + auto read_alignedl = sig_bytemask ^ cur[0]; + auto read_alignedh = sig_bytemask ^ cur[4]; + auto read_alignedlh = static_cast(read_alignedl >> 32); + auto read_alignedhh = static_cast(read_alignedh >> 32); + { + auto val = 1u << 6; + resultl |= read_alignedl & mask_bytemask1 ? resultl : val; + resulth |= read_alignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 1u << 4; + resultl |= read_alignedl & mask_bytemask2 ? resultl : val; + resulth |= read_alignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 1u << 2; + resultl |= static_cast(read_alignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_alignedh) & 0xFFFF'0000u ? resulth : val; + } + { + resultl |= static_cast(!bool(static_cast(read_alignedl))); + resulth |= static_cast(!bool(static_cast(read_alignedh))); + } + } + { + auto read_alignedl = sig_bytemask ^ cur[1]; + auto read_alignedh = sig_bytemask ^ cur[5]; + { + auto val = 1u << 14; + resultl |= read_alignedl & mask_bytemask1 ? resultl : val; + resulth |= read_alignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 1u << 12; + resultl |= read_alignedl & mask_bytemask2 ? resultl : val; + resulth |= read_alignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 1u << 10; + resultl |= static_cast(read_alignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_alignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 1u << 8; + resultl |= static_cast(read_alignedl) ? resultl : val; + resulth |= static_cast(read_alignedh) ? resulth : val; + } + } + { + auto read_alignedl = sig_bytemask ^ cur[2]; + auto read_alignedh = sig_bytemask ^ cur[6]; + { + auto val = 1u << 22; + resultl |= read_alignedl & mask_bytemask1 ? resultl : val; + resulth |= read_alignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 1u << 20; + resultl |= read_alignedl & mask_bytemask2 ? resultl : val; + resulth |= read_alignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 1u << 18; + resultl |= static_cast(read_alignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_alignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 1u << 16; + resultl |= static_cast(read_alignedl) ? resultl : val; + resulth |= static_cast(read_alignedh) ? resulth : val; + } + } + { + auto read_alignedl = sig_bytemask ^ cur[3]; + auto read_alignedh = sig_bytemask ^ cur[7]; + { + auto val = 1u << 30; + resultl |= read_alignedl & mask_bytemask1 ? resultl : val; + resulth |= read_alignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 1u << 28; + resultl |= read_alignedl & mask_bytemask2 ? resultl : val; + resulth |= read_alignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 1u << 26; + resultl |= static_cast(read_alignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_alignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 1u << 24; + resultl |= static_cast(read_alignedl) ? resultl : val; + resulth |= static_cast(read_alignedh) ? resulth : val; + } + } + { + auto read_unalignedl = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[0]; + auto read_unalignedh = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[4]; + { + auto val = 2u << 6; + resultl = read_unalignedl & mask_bytemask1 ? resultl : val; + resulth = read_unalignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 2u << 4; + resultl |= read_unalignedl & mask_bytemask2 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 2u << 2; + resultl |= static_cast(read_unalignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_unalignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 2u; + resultl |= static_cast(read_unalignedl) ? resultl : val; + resulth |= static_cast(read_unalignedh) ? resulth : val; + } + } + { + auto read_unalignedl = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[1]; + auto read_unalignedh = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[5]; + { + auto val = 2u << 14; + resultl |= read_unalignedl & mask_bytemask1 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 2u << 12; + resultl |= read_unalignedl & mask_bytemask2 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 2u << 10; + resultl |= static_cast(read_unalignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_unalignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 2u << 8; + resultl |= static_cast(read_unalignedl) ? resultl : val; + resulth |= static_cast(read_unalignedh) ? resulth : val; + } + } + { + auto read_unalignedl = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[2]; + auto read_unalignedh = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[6]; + { + auto val = 2u << 22; + resultl |= read_unalignedl & mask_bytemask1 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 2u << 20; + resultl |= read_unalignedl & mask_bytemask2 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 2u << 18; + resultl |= static_cast(read_unalignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_unalignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 2u << 16; + resultl |= static_cast(read_unalignedl) ? resultl : val; + resulth |= static_cast(read_unalignedh) ? resulth : val; + } + } + { + auto read_unalignedl = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[3]; + auto read_unalignedh = sig_bytemask ^ reinterpret_cast(reinterpret_cast(cur) + 1)[7]; + { + auto val = 2u << 30; + resultl |= read_unalignedl & mask_bytemask1 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask1 ? resulth : val; + } + { + auto val = 2u << 28; + resultl |= read_unalignedl & mask_bytemask2 ? resultl : val; + resulth |= read_unalignedh & mask_bytemask2 ? resulth : val; + } + { + auto val = 2u << 26; + resultl |= static_cast(read_unalignedl) & 0xFFFF'0000u ? resultl : val; + resulth |= static_cast(read_unalignedh) & 0xFFFF'0000u ? resulth : val; + } + { + auto val = 2u << 24; + resultl |= static_cast(read_unalignedl) ? resultl : val; + resulth |= static_cast(read_unalignedh) ? resulth : val; + } + resulth <<= 32; + if PATTERN16_LIKELY (!(resulth |= resultl)) goto outer_loop_continue1; + } + { + auto cur_sig = reinterpret_cast(cur) + sig_offset; + uint64_t result = resulth; + inner_loop_continue1: + inner_loop_continue2: + if PATTERN16_UNLIKELY (!(resulth &= result)) goto outer_loop_continue2; + auto cur_sig_start = reinterpret_cast(cur_sig + _tzcnt_u64(resulth)); + result = resulth--; + auto potential_match = *cur_sig_start; + potential_match ^= psig[0]; + if PATTERN16_LIKELY (potential_match &= signature.second[0]) goto inner_loop_continue1; + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = cur_sig_start[length_]; + potential_match ^= psig[length_]; + if (potential_match &= signature.second[length_]) break; + } + goto inner_loop_continue2; + } + } + } + auto cur_byte = reinterpret_cast(cur); + auto end_byte = reinterpret_cast(cur + 13); + do { + auto cur_sig_start = reinterpret_cast(cur_byte); + auto length_ = length; + while (length_--) { + if (!length_) return (const void*)cur_sig_start; + auto potential_match = cur_sig_start[length_]; + potential_match ^= psig[length_]; + if (potential_match & signature.second[length_]) break; + } + } while (++cur_byte < end_byte); + return nullptr; + } + } +} \ No newline at end of file