Skip to content

Commit

Permalink
#443: Add armv7 neon implementation for Simd<u8, 16>::swizzle_dyn
Browse files Browse the repository at this point in the history
Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
  • Loading branch information
hkratz authored Oct 22, 2024
1 parent 158e240 commit 7e162d1
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions crates/core_simd/src/swizzle_dyn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ where
target_endian = "little"
))]
16 => transize(vqtbl1q_u8, self, idxs),
#[cfg(all(
target_arch = "arm",
target_feature = "v7",
target_feature = "neon",
target_endian = "little"
))]
16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
32 => transize(avx2_pshufb, self, idxs),
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
Expand Down Expand Up @@ -98,6 +105,28 @@ where
}
}

/// Dynamic byte swizzle of a `u8x16` for armv7 neon.
///
/// The sixteen source bytes are split into two 8-byte halves that together
/// form a `u8x8x2` lookup table. Each 8-byte half of `idxs` is then looked up
/// in that table separately, and the two results are joined back into a
/// single 16-byte vector.
///
/// # Safety
/// The caller must ensure armv7 neon is available on the executing CPU.
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: Caller promised arm neon support
    unsafe {
        // Two-register table covering all 16 source bytes (low half first).
        let table = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
        // The table lookup handles 8 indices at a time, so split the index vector.
        let idx_lo = vget_low_u8(idxs.into());
        let idx_hi = vget_high_u8(idxs.into());
        // Look up each half and stitch the halves back together in order.
        vcombine_u8(vtbl2_u8(table, idx_lo), vtbl2_u8(table, idx_hi)).into()
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
Expand Down

0 comments on commit 7e162d1

Please sign in to comment.