diff --git a/CMakeLists.txt b/CMakeLists.txt index 58e65a062..21126cbca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1769,7 +1769,12 @@ if (NOT WIN32) ) endif() -if ((NOT is_x86) AND (NOT is_x64)) +if (x86_has_sse2) + target_compile_definitions(tg_owt + PRIVATE + WEBRTC_HAS_SSE2 + ) +else() remove_target_sources(tg_owt ${webrtc_loc} common_audio/fir_filter_sse.cc common_audio/fir_filter_sse.h diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 19758bc83..445205d69 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -8,8 +8,10 @@ set(is_arm 0) set(is_arm8 0) set(is_arm7 0) set(arm_use_neon 0) +set(x86_has_sse2 0) option(TG_OWT_ARCH_ARMV7_USE_NEON "Use NEON SIMD instructions when building for ARMv7" ON) +option(TG_OWT_ARCH_X86_FORCE_SSE "Assume SSE instructions available when building for IA-32" ON) # Check for 64-bit x86 (aka x64): @@ -117,3 +119,29 @@ endif() # arm32 endif() # aarch64 endif() # x86 endif() # x64 + + +if (is_x86 OR is_x64) + check_symbol_exists(__SSE__ "stddef.h" HAVE_SSE1_DEF) + check_symbol_exists(__SSE2__ "stddef.h" HAVE_SSE2_DEF) + + if (HAVE_SSE1_DEF AND HAVE_SSE2_DEF) + message(STATUS "Compiler natively supports SSE and SSE2, these SIMD instructions are now enabled") + set(x86_has_sse2 1) + elseif (TG_OWT_ARCH_X86_FORCE_SSE) + message(STATUS "SSE SIMD instructions enabled (can be disabled with -DTG_OWT_ARCH_X86_FORCE_SSE=OFF).") + set(x86_has_sse2 1) + + if (WIN32) + # TODO: Add the correct flags for Windows here. + elseif (APPLE) + # TODO: Add the correct flags for Apple devices here. 
+ else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") + set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -msse2") + endif() + else() + message(STATUS "Runtime checks of SSE SIMD activated (can be forced with -DTG_OWT_ARCH_X86_FORCE_SSE=ON).") + endif() +endif() diff --git a/cmake/init_target.cmake b/cmake/init_target.cmake index 15aa965b1..22277ab03 100644 --- a/cmake/init_target.cmake +++ b/cmake/init_target.cmake @@ -75,13 +75,6 @@ function(init_target target_name) # init_target(my_target folder_name) endif() endif() - if (is_x86) - target_compile_options(${target_name} - PRIVATE - -msse2 - ) - endif() - target_compile_definitions(${target_name} PRIVATE HAVE_NETINET_IN_H diff --git a/cmake/libpffft.cmake b/cmake/libpffft.cmake index fa2e66f9f..655f5ceec 100644 --- a/cmake/libpffft.cmake +++ b/cmake/libpffft.cmake @@ -15,7 +15,7 @@ PRIVATE _USE_MATH_DEFINES ) -if (NOT is_x86 AND NOT is_x64 AND NOT arm_use_neon) +if (NOT x86_has_sse2 AND NOT arm_use_neon) target_compile_definitions(libpffft PRIVATE PFFFT_SIMD_DISABLE diff --git a/src/modules/audio_processing/aec3/adaptive_fir_filter.cc b/src/modules/audio_processing/aec3/adaptive_fir_filter.cc index 6a0f53166..87d559a87 100644 --- a/src/modules/audio_processing/aec3/adaptive_fir_filter.cc +++ b/src/modules/audio_processing/aec3/adaptive_fir_filter.cc @@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon( #if defined(WEBRTC_ARCH_X86_FAMILY) // Computes and stores the frequency response of the filter. -void ComputeFrequencyResponse_Sse2( +RTC_TARGET_SSE2 void ComputeFrequencyResponse_Sse2( size_t num_partitions, const std::vector>& H, std::vector>* H2) { @@ -210,10 +210,11 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer, #if defined(WEBRTC_ARCH_X86_FAMILY) // Adapts the filter partitions. 
(SSE2 variant) -void AdaptPartitions_Sse2(const RenderBuffer& render_buffer, - const FftData& G, - size_t num_partitions, - std::vector>* H) { +RTC_TARGET_SSE2 void AdaptPartitions_Sse2( + const RenderBuffer& render_buffer, + const FftData& G, + size_t num_partitions, + std::vector>* H) { rtc::ArrayView> render_buffer_data = render_buffer.GetFftBuffer(); const size_t num_render_channels = render_buffer_data[0].size(); @@ -375,10 +376,11 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer, #if defined(WEBRTC_ARCH_X86_FAMILY) // Produces the filter output (SSE2 variant). -void ApplyFilter_Sse2(const RenderBuffer& render_buffer, - size_t num_partitions, - const std::vector>& H, - FftData* S) { +RTC_TARGET_SSE2 void ApplyFilter_Sse2( + const RenderBuffer& render_buffer, + size_t num_partitions, + const std::vector>& H, + FftData* S) { // const RenderBuffer& render_buffer, // rtc::ArrayView H, // FftData* S) { diff --git a/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc b/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc index 80378eb3c..64a7a11c7 100644 --- a/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc +++ b/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc @@ -57,7 +57,7 @@ void ErlComputer_NEON( #if defined(WEBRTC_ARCH_X86_FAMILY) // Computes and stores the echo return loss estimate of the filter, which is the // sum of the partition frequency responses. 
-void ErlComputer_SSE2( +RTC_TARGET_SSE2 void ErlComputer_SSE2( const std::vector>& H2, rtc::ArrayView erl) { std::fill(erl.begin(), erl.end(), 0.f); diff --git a/src/modules/audio_processing/aec3/fft_data.h b/src/modules/audio_processing/aec3/fft_data.h index 5e5adb62d..a02ffe2fe 100644 --- a/src/modules/audio_processing/aec3/fft_data.h +++ b/src/modules/audio_processing/aec3/fft_data.h @@ -45,7 +45,7 @@ struct FftData { rtc::ArrayView power_spectrum) const { RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size()); switch (optimization) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2) case Aec3Optimization::kSse2: { constexpr int kNumFourBinBands = kFftLengthBy2 / 4; constexpr int kLimit = kNumFourBinBands * 4; diff --git a/src/modules/audio_processing/aec3/matched_filter.cc b/src/modules/audio_processing/aec3/matched_filter.cc index 2a489923b..50cfc0185 100644 --- a/src/modules/audio_processing/aec3/matched_filter.cc +++ b/src/modules/audio_processing/aec3/matched_filter.cc @@ -144,14 +144,14 @@ void MatchedFilterCore_NEON(size_t x_start_index, #if defined(WEBRTC_ARCH_X86_FAMILY) -void MatchedFilterCore_SSE2(size_t x_start_index, - float x2_sum_threshold, - float smoothing, - rtc::ArrayView x, - rtc::ArrayView y, - rtc::ArrayView h, - bool* filters_updated, - float* error_sum) { +RTC_TARGET_SSE2 void MatchedFilterCore_SSE2(size_t x_start_index, + float x2_sum_threshold, + float smoothing, + rtc::ArrayView x, + rtc::ArrayView y, + rtc::ArrayView h, + bool* filters_updated, + float* error_sum) { const int h_size = static_cast(h.size()); const int x_size = static_cast(x.size()); RTC_DCHECK_EQ(0, h_size % 4); diff --git a/src/modules/audio_processing/aec3/vector_math.h b/src/modules/audio_processing/aec3/vector_math.h index 883cd95fd..b12ad6048 100644 --- a/src/modules/audio_processing/aec3/vector_math.h +++ b/src/modules/audio_processing/aec3/vector_math.h @@ -42,7 +42,7 @@ class VectorMath { // Elementwise 
square root. void Sqrt(rtc::ArrayView x) { switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; @@ -116,7 +116,7 @@ class VectorMath { RTC_DCHECK_EQ(z.size(), x.size()); RTC_DCHECK_EQ(z.size(), y.size()); switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; @@ -162,7 +162,7 @@ class VectorMath { void Accumulate(rtc::ArrayView x, rtc::ArrayView z) { RTC_DCHECK_EQ(z.size(), x.size()); switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; diff --git a/src/modules/audio_processing/agc2/rnn_vad/rnn.cc b/src/modules/audio_processing/agc2/rnn_vad/rnn.cc index 55a51ffa4..1238d5db1 100644 --- a/src/modules/audio_processing/agc2/rnn_vad/rnn.cc +++ b/src/modules/audio_processing/agc2/rnn_vad/rnn.cc @@ -229,7 +229,7 @@ void ComputeFullyConnectedLayerOutput( #if defined(WEBRTC_ARCH_X86_FAMILY) // Fully connected layer SSE2 implementation. -void ComputeFullyConnectedLayerOutputSse2( +RTC_TARGET_SSE2 void ComputeFullyConnectedLayerOutputSse2( size_t input_size, size_t output_size, rtc::ArrayView input, diff --git a/src/rtc_base/system/inline.h b/src/rtc_base/system/inline.h index f585d34de..7828b2599 100644 --- a/src/rtc_base/system/inline.h +++ b/src/rtc_base/system/inline.h @@ -28,4 +28,10 @@ #endif +#if defined(__GNUC__) && !defined(__SSE2__) +#define RTC_TARGET_SSE2 __attribute__((__target__("sse2"))) +#else +#define RTC_TARGET_SSE2 +#endif + #endif // RTC_BASE_SYSTEM_INLINE_H_