Skip to content

Commit

Permalink
Add project files.
Browse files Browse the repository at this point in the history
  • Loading branch information
Dasaav-dsv committed Jul 16, 2023
1 parent 0f69f18 commit 3ba9b44
Show file tree
Hide file tree
Showing 3 changed files with 445 additions and 0 deletions.
83 changes: 83 additions & 0 deletions include/scanners/x64/AVX.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#pragma once

#include <vector>
#include <cstdint>
#include <immintrin.h>

#include "../../util.h"

namespace Pattern16 {
namespace Impl {
PATTERN16_NO_INLINE const void* scanRegion(const void* regionStart, const void* regionEnd, size_t sigStart, int, SplitSignature<__m256i>& signature, size_t length) {
auto sig_bytes = _mm256_load_si256(signature.first.data());
auto mask_bytes = _mm256_load_si256(signature.second.data());
auto psig_bytes = (reinterpret_cast<uint8_t*>(signature.first.data()));
auto sig_bytemask = broadcastMask256(psig_bytes[0 + sigStart], psig_bytes[1 + sigStart]);
auto sig_offset = -(intptr_t)sigStart;
auto blendmask = _mm256_set1_epi16(0x8000u);
auto cur = reinterpret_cast<const __m256i*>(regionStart) - 2;
auto safety = (signature.first.size() + 1) * sizeof(__m256i);
auto span = (((uintptr_t)regionEnd - (uintptr_t)regionStart - safety) >> 6);
{
outer_loop_continue1:
outer_loop_continue2:
cur += 2;
if PATTERN16_LIKELY (--span) {
_mm_prefetch(reinterpret_cast<const char*>(cur + 16), _MM_HINT_T0);
_mm_prefetch(reinterpret_cast<const char*>(cur + 48), _MM_HINT_T0);
uint64_t result;
uint32_t resultl;
uint32_t resulth;
{
auto read_aligned1 = _mm256_stream_load_si256(cur);
auto read_unaligned1 = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(reinterpret_cast<const uint8_t*>(cur) + 1));
auto check_aligned1 = _mm256_cmpeq_epi16(sig_bytemask, read_aligned1);
auto check_unaligned1 = _mm256_cmpeq_epi16(sig_bytemask, read_unaligned1);
resultl = _mm256_movemask_epi8(_mm256_blendv_epi8(check_aligned1, check_unaligned1, blendmask));
auto read_aligned2 = _mm256_stream_load_si256(cur + 1);
auto read_unaligned2 = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(reinterpret_cast<const uint8_t*>(cur) + 1) + 1);
auto check_aligned2 = _mm256_cmpeq_epi16(sig_bytemask, read_aligned2);
auto check_unaligned2 = _mm256_cmpeq_epi16(sig_bytemask, read_unaligned2);
resulth = _mm256_movemask_epi8(_mm256_blendv_epi8(check_aligned2, check_unaligned2, blendmask));
if PATTERN16_LIKELY (!(resulth | resultl)) goto outer_loop_continue1;
}
{
result = resultl;
result += static_cast<uint64_t>(resulth) << 32;
auto cur_sig = reinterpret_cast<const uint8_t*>(cur) + sig_offset;
uint64_t result_ = result;
inner_loop_continue1:
inner_loop_continue2:
if PATTERN16_UNLIKELY (!(result &= result_)) goto outer_loop_continue2;
auto cur_sig_start = reinterpret_cast<const __m256i*>(cur_sig + _tzcnt_u64(result));
result_ = result--;
auto potential_match = _mm256_lddqu_si256(cur_sig_start);
potential_match = _mm256_xor_si256(potential_match, sig_bytes);
if PATTERN16_LIKELY (!_mm256_testz_si256(potential_match, mask_bytes)) goto inner_loop_continue1;
auto length_ = length;
while (length_--) {
if (!length_) return (const void*)cur_sig_start;
auto potential_match = _mm256_lddqu_si256(cur_sig_start + length_);
potential_match = _mm256_xor_si256(potential_match, signature.first[length_]);
if (!_mm256_testz_si256(potential_match, signature.second[length_])) break;
}
goto inner_loop_continue2;
}
}
}
auto cur_byte = reinterpret_cast<const uint8_t*>(cur);
auto end_byte = reinterpret_cast<const uint8_t*>(cur + 3);
do {
auto cur_sig_start = reinterpret_cast<const __m256i*>(cur_byte);
auto length_ = length;
while (length_--) {
if (!length_) return (const void*)cur_sig_start;
auto potential_match = _mm256_lddqu_si256(cur_sig_start + length_);
potential_match = _mm256_xor_si256(potential_match, signature.first[length_]);
if (!_mm256_testz_si256(potential_match, signature.second[length_])) break;
}
} while (++cur_byte < end_byte);
return nullptr;
}
}
}
98 changes: 98 additions & 0 deletions include/scanners/x64/SSE.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#pragma once

#include <vector>
#include <cstdint>
#include <immintrin.h>

#include "../../util.h"

namespace Pattern16 {
namespace Impl {
template <SSE_VERSION version = SSE4_1>
PATTERN16_NO_INLINE const void* scanRegion(const void* regionStart, const void* regionEnd, size_t sigStart, int, SplitSignature<__m128i>& signature, size_t length) {
auto sig_bytes = _mm_load_si128(signature.first.data());
auto mask_bytes = _mm_load_si128(signature.second.data());
auto psig_bytes = (reinterpret_cast<uint8_t*>(signature.first.data()));
auto sig_bytemask = broadcastMask128(psig_bytes[0 + sigStart], psig_bytes[1 + sigStart]);
auto sig_offset = -(intptr_t)sigStart;
auto checkmask_unaligned = broadcastMask128(0, 0xFF);
auto cur = reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(regionStart)) - 4;
auto safety = (signature.first.size() + 3) * sizeof(__m128i);
auto span = (((uintptr_t)regionEnd - (uintptr_t)regionStart - safety) >> 6);
{
outer_loop_continue1:
outer_loop_continue2:
cur += 4;
if PATTERN16_LIKELY (--span) {
_mm_prefetch(reinterpret_cast<const char*>(cur + 32), _MM_HINT_T0);
_mm_prefetch(reinterpret_cast<const char*>(cur + 96), _MM_HINT_T0);
uint64_t result;
{
auto read_aligned1 = _mm_load_si128(cur + 2);
auto read_unaligned1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(cur) + 1) + 2);
auto read_aligned2 = _mm_load_si128(cur + 3);
auto read_unaligned2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(cur) + 1) + 3);
auto check_aligned2 = _mm_cmpeq_epi16(read_aligned2, sig_bytemask);
auto check_unaligned2 = _mm_cmpeq_epi16(read_unaligned2, sig_bytemask);
check_unaligned2 = _mm_and_si128(check_unaligned2, checkmask_unaligned);
result = _mm_movemask_epi8(_mm_or_si128(check_aligned2, check_unaligned2));
result <<= 16;
auto check_aligned1 = _mm_cmpeq_epi16(read_aligned1, sig_bytemask);
auto check_unaligned1 = _mm_cmpeq_epi16(read_unaligned1, sig_bytemask);
check_unaligned1 = _mm_and_si128(check_unaligned1, checkmask_unaligned);
result |= _mm_movemask_epi8(_mm_or_si128(check_aligned1, check_unaligned1));
result <<= 16;
}
{
auto read_aligned1 = _mm_load_si128(cur);
auto read_unaligned1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(cur) + 1));
auto read_aligned2 = _mm_load_si128(cur + 1);
auto read_unaligned2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(cur) + 1) + 1);
auto check_aligned2 = _mm_cmpeq_epi16(read_aligned2, sig_bytemask);
auto check_unaligned2 = _mm_cmpeq_epi16(read_unaligned2, sig_bytemask);
check_unaligned2 = _mm_and_si128(check_unaligned2, checkmask_unaligned);
result |= _mm_movemask_epi8(_mm_or_si128(check_aligned2, check_unaligned2));
result <<= 16;
auto check_aligned1 = _mm_cmpeq_epi16(read_aligned1, sig_bytemask);
auto check_unaligned1 = _mm_cmpeq_epi16(read_unaligned1, sig_bytemask);
check_unaligned1 = _mm_and_si128(check_unaligned1, checkmask_unaligned);
if PATTERN16_LIKELY (!(result |= _mm_movemask_epi8(_mm_or_si128(check_aligned1, check_unaligned1)))) goto outer_loop_continue1;
}
{
auto cur_sig = reinterpret_cast<const uint8_t*>(cur) + sig_offset;
uint64_t result_ = result;
inner_loop_continue1:
inner_loop_continue2:
if PATTERN16_UNLIKELY (!(result &= result_)) goto outer_loop_continue2;
auto cur_sig_start = reinterpret_cast<const __m128i*>(cur_sig + _tzcnt_u64(result));
result_ = result--;
auto potential_match = _mm_loadu_si128(cur_sig_start);
potential_match = _mm_xor_si128(potential_match, sig_bytes);
if PATTERN16_LIKELY (!_mm_testz_SSE<version>(potential_match, mask_bytes)) goto inner_loop_continue1;
auto length_ = length;
while (length_--) {
if (!length_) return (const void*)cur_sig_start;
auto potential_match = _mm_loadu_si128(cur_sig_start + length_);
potential_match = _mm_xor_si128(potential_match, signature.first[length_]);
if (!_mm_testz_SSE<version>(potential_match, signature.second[length_])) break;
}
goto inner_loop_continue2;
}
}
}
auto cur_byte = reinterpret_cast<const uint8_t*>(cur);
auto end_byte = reinterpret_cast<const uint8_t*>(cur + 7);
do {
auto cur_sig_start = reinterpret_cast<const __m128i*>(cur_byte);
auto length_ = length;
while (length_--) {
if (!length_) return (const void*)cur_sig_start;
auto potential_match = _mm_loadu_si128(cur_sig_start + length_);
potential_match = _mm_xor_si128(potential_match, signature.first[length_]);
if (!_mm_testz_si128(potential_match, signature.second[length_])) break;
}
} while (++cur_byte < end_byte);
return nullptr;
}
}
}
Loading

0 comments on commit 3ba9b44

Please sign in to comment.