Summary of this patch (free text ahead of the first file header):
  * vqsort-inl.h: Fill16BytesStatic and GetGeneratorStateStatic move from the
    per-target section into hwy::detail inside the once-only include guard;
    Sort() now names hwy::detail::GetGeneratorStateStatic explicitly.
  * vqsort.cc: GetGeneratorState() calls hwy::detail::GetGeneratorStateStatic
    directly instead of going through HWY_STATIC_DISPATCH.
  * detect_targets.h: on Arm/PPC Linux, HWY_HAVE_RUNTIME_DISPATCH is also set
    for Clang >= 16 when HWY_ENABLE_CLANG_ARM_DISPATCH is defined (opt-in).

diff --git a/hwy/contrib/sort/vqsort-inl.h b/hwy/contrib/sort/vqsort-inl.h
index 3f73fa1221..bd98835b34 100644
--- a/hwy/contrib/sort/vqsort-inl.h
+++ b/hwy/contrib/sort/vqsort-inl.h
@@ -43,6 +43,40 @@
 #include "hwy/contrib/sort/vqsort.h"  // Fill16BytesSecure
 #endif
 
+namespace hwy {
+namespace detail {
+
+HWY_INLINE void Fill16BytesStatic(void* bytes) {
+#if !VQSORT_ONLY_STATIC
+  if (Fill16BytesSecure(bytes)) return;
+#endif
+
+  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
+
+  // Static-only, or Fill16BytesSecure failed. Get some entropy from the
+  // stack/code location, and the clock() timer.
+  uint64_t** seed_stack = &words;
+  void (*seed_code)(void*) = &Fill16BytesStatic;
+  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
+  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
+  const uint64_t bits_time = static_cast<uint64_t>(clock());
+  words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98;  // "Nothing up my sleeve"
+  words[1] = bits_code ^ bits_time ^ 0x01234567;   // constants.
+}
+
+HWY_INLINE uint64_t* GetGeneratorStateStatic() {
+  thread_local uint64_t state[3] = {0};
+  // This is a counter; zero indicates not yet initialized.
+  if (HWY_UNLIKELY(state[2] == 0)) {
+    Fill16BytesStatic(state);
+    state[2] = 1;
+  }
+  return state;
+}
+
+}  // namespace detail
+}  // namespace hwy
+
 #endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
 
 // Per-target
@@ -1754,34 +1788,6 @@ HWY_INLINE size_t CountAndReplaceNaN(D, Traits, T* HWY_RESTRICT, size_t) {
   return 0;
 }
 
-HWY_INLINE void Fill16BytesStatic(void* bytes) {
-#if !VQSORT_ONLY_STATIC
-  if (Fill16BytesSecure(bytes)) return;
-#endif
-
-  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
-
-  // Static-only, or Fill16BytesSecure failed. Get some entropy from the
-  // stack/code location, and the clock() timer.
-  uint64_t** seed_stack = &words;
-  void (*seed_code)(void*) = &Fill16BytesStatic;
-  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
-  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
-  const uint64_t bits_time = static_cast<uint64_t>(clock());
-  words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98;  // "Nothing up my sleeve"
-  words[1] = bits_code ^ bits_time ^ 0x01234567;   // constants.
-}
-
-HWY_INLINE uint64_t* GetGeneratorStateStatic() {
-  thread_local uint64_t state[3] = {0};
-  // This is a counter; zero indicates not yet initialized.
-  if (HWY_UNLIKELY(state[2] == 0)) {
-    Fill16BytesStatic(state);
-    state[2] = 1;
-  }
-  return state;
-}
-
 }  // namespace detail
 
 // Old interface with user-specified buffer, retained for compatibility. Called
@@ -1806,7 +1812,7 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
 
 #if VQSORT_ENABLED || HWY_IDE
   if (!detail::HandleSpecialCases(d, st, keys, num, buf)) {
-    uint64_t* HWY_RESTRICT state = detail::GetGeneratorStateStatic();
+    uint64_t* HWY_RESTRICT state = hwy::detail::GetGeneratorStateStatic();
     // Introspection: switch to worst-case N*logN heapsort after this many.
     // Should never be reached, so computing log2 exactly does not help.
     const size_t max_levels = 50;
diff --git a/hwy/contrib/sort/vqsort.cc b/hwy/contrib/sort/vqsort.cc
index 9d938360fb..da8ca64b39 100644
--- a/hwy/contrib/sort/vqsort.cc
+++ b/hwy/contrib/sort/vqsort.cc
@@ -211,8 +211,6 @@ void Sorter::Fill24Bytes(const void*, size_t, void*) {}
 bool Sorter::HaveFloat64() { return hwy::HaveFloat64(); }
 Sorter::Sorter() {}
 void Sorter::Delete() {}
-uint64_t* GetGeneratorState() {
-  return HWY_STATIC_DISPATCH(detail::GetGeneratorStateStatic());
-}
+uint64_t* GetGeneratorState() { return hwy::detail::GetGeneratorStateStatic(); }
 
 }  // namespace hwy
diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h
index 693bb0e98d..22a9ee35ae 100644
--- a/hwy/detect_targets.h
+++ b/hwy/detect_targets.h
@@ -537,9 +537,11 @@
 // Clang, GCC and MSVC allow runtime dispatch on x86.
 #if HWY_ARCH_X86
 #define HWY_HAVE_RUNTIME_DISPATCH 1
-// On Arm/PPC, currently only GCC does, and we require Linux to detect CPU
-// capabilities.
-#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_COMPILER_GCC_ACTUAL && \
+// On Arm/PPC, GCC and Clang 16+ do, and we require Linux to detect CPU
+// capabilities. Currently require opt-in for Clang because it is experimental.
+#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && \
+    (HWY_COMPILER_GCC_ACTUAL || (HWY_COMPILER_CLANG >= 1600 && \
+                                 defined(HWY_ENABLE_CLANG_ARM_DISPATCH))) && \
     HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
 #define HWY_HAVE_RUNTIME_DISPATCH 1
 #else