Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid SSE2 usage on i386 without proper checks. #45

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1769,7 +1769,12 @@ if (NOT WIN32)
)
endif()

if ((NOT is_x86) AND (NOT is_x64))
if (x86_has_sse2)
target_compile_definitions(tg_owt
PRIVATE
WEBRTC_HAS_SSE2
)
else()
remove_target_sources(tg_owt ${webrtc_loc}
common_audio/fir_filter_sse.cc
common_audio/fir_filter_sse.h
Expand Down
28 changes: 28 additions & 0 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ set(is_arm 0)
set(is_arm8 0)
set(is_arm7 0)
set(arm_use_neon 0)
set(x86_has_sse2 0)

option(TG_OWT_ARCH_ARMV7_USE_NEON "Use NEON SIMD instructions when building for ARMv7" ON)
option(TG_OWT_ARCH_X86_FORCE_SSE "Assume SSE instructions available when building for IA-32" ON)


# Check for 64-bit x86 (aka x64):
Expand Down Expand Up @@ -117,3 +119,29 @@ endif() # arm32
endif() # aarch64
endif() # x86
endif() # x64


# Decide whether SSE/SSE2 can be assumed at compile time on x86 / x64.
#
# Result: x86_has_sse2 is set to 1 when either
#   * the compiler already predefines both __SSE__ and __SSE2__, or
#   * the user keeps TG_OWT_ARCH_X86_FORCE_SSE enabled, in which case -msse2 is
#     appended to the global C / C++ / ASM flags on non-Windows, non-Apple hosts.
# Otherwise x86_has_sse2 is left untouched and only a status message is printed.
if (is_x86 OR is_x64)
    # Probe the compiler's predefined macros; "stddef.h" is just a header that
    # is always available to compile the probe against.
    check_symbol_exists(__SSE__ "stddef.h" HAVE_SSE1_DEF)
    check_symbol_exists(__SSE2__ "stddef.h" HAVE_SSE2_DEF)

    if (HAVE_SSE1_DEF AND HAVE_SSE2_DEF)
        # The default target already includes SSE2: no extra flags are needed.
        message(STATUS "Compiler natively supports SSE and SSE2, these SIMD instructions now enabled")
        set(x86_has_sse2 1)
    elseif (TG_OWT_ARCH_X86_FORCE_SSE)
        message(STATUS "SSE SIMD instructions enabled (can be disabled with -DTG_OWT_ARCH_X86_FORCE_SSE=OFF).")
        set(x86_has_sse2 1)

        if (WIN32)
            # TODO: Add the correct flags for Windows here.
        elseif (APPLE)
            # TODO: Add the correct flags for Apple devices here.
        else()
            # GCC-style switch; applied globally so every target (including
            # assembly sources) is built with SSE2 code generation.
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
            set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -msse2")
        endif()
    else()
        # SSE2 is neither native nor forced: x86_has_sse2 keeps its prior value.
        message(STATUS "Runtime checks of SSE SIMD activated (can be forced with -DTG_OWT_ARCH_X86_FORCE_SSE=ON).")
    endif()
endif()
7 changes: 0 additions & 7 deletions cmake/init_target.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,6 @@ function(init_target target_name) # init_target(my_target folder_name)
endif()
endif()

if (is_x86)
target_compile_options(${target_name}
PRIVATE
-msse2
)
endif()

target_compile_definitions(${target_name}
PRIVATE
HAVE_NETINET_IN_H
Expand Down
2 changes: 1 addition & 1 deletion cmake/libpffft.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ PRIVATE
_USE_MATH_DEFINES
)

if (NOT is_x86 AND NOT is_x64 AND NOT arm_use_neon)
if (NOT x86_has_sse2 AND NOT arm_use_neon)
target_compile_definitions(libpffft
PRIVATE
PFFFT_SIMD_DISABLE
Expand Down
20 changes: 11 additions & 9 deletions src/modules/audio_processing/aec3/adaptive_fir_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon(

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Computes and stores the frequency response of the filter.
void ComputeFrequencyResponse_Sse2(
RTC_TARGET_SSE2 void ComputeFrequencyResponse_Sse2(
size_t num_partitions,
const std::vector<std::vector<FftData>>& H,
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2) {
Expand Down Expand Up @@ -210,10 +210,11 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer,

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Adapts the filter partitions. (SSE2 variant)
void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
const FftData& G,
size_t num_partitions,
std::vector<std::vector<FftData>>* H) {
RTC_TARGET_SSE2 void AdaptPartitions_Sse2(
const RenderBuffer& render_buffer,
const FftData& G,
size_t num_partitions,
std::vector<std::vector<FftData>>* H) {
rtc::ArrayView<const std::vector<FftData>> render_buffer_data =
render_buffer.GetFftBuffer();
const size_t num_render_channels = render_buffer_data[0].size();
Expand Down Expand Up @@ -375,10 +376,11 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer,

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Produces the filter output (SSE2 variant).
void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
size_t num_partitions,
const std::vector<std::vector<FftData>>& H,
FftData* S) {
RTC_TARGET_SSE2 void ApplyFilter_Sse2(
const RenderBuffer& render_buffer,
size_t num_partitions,
const std::vector<std::vector<FftData>>& H,
FftData* S) {
// const RenderBuffer& render_buffer,
// rtc::ArrayView<const FftData> H,
// FftData* S) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void ErlComputer_NEON(
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Computes and stores the echo return loss estimate of the filter, which is the
// sum of the partition frequency responses.
void ErlComputer_SSE2(
RTC_TARGET_SSE2 void ErlComputer_SSE2(
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
rtc::ArrayView<float> erl) {
std::fill(erl.begin(), erl.end(), 0.f);
Expand Down
2 changes: 1 addition & 1 deletion src/modules/audio_processing/aec3/fft_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ struct FftData {
rtc::ArrayView<float> power_spectrum) const {
RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
switch (optimization) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
case Aec3Optimization::kSse2: {
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
constexpr int kLimit = kNumFourBinBands * 4;
Expand Down
16 changes: 8 additions & 8 deletions src/modules/audio_processing/aec3/matched_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,14 @@ void MatchedFilterCore_NEON(size_t x_start_index,

#if defined(WEBRTC_ARCH_X86_FAMILY)

void MatchedFilterCore_SSE2(size_t x_start_index,
float x2_sum_threshold,
float smoothing,
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float> h,
bool* filters_updated,
float* error_sum) {
RTC_TARGET_SSE2 void MatchedFilterCore_SSE2(size_t x_start_index,
float x2_sum_threshold,
float smoothing,
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float> h,
bool* filters_updated,
float* error_sum) {
const int h_size = static_cast<int>(h.size());
const int x_size = static_cast<int>(x.size());
RTC_DCHECK_EQ(0, h_size % 4);
Expand Down
6 changes: 3 additions & 3 deletions src/modules/audio_processing/aec3/vector_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class VectorMath {
// Elementwise square root.
void Sqrt(rtc::ArrayView<float> x) {
switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
Expand Down Expand Up @@ -116,7 +116,7 @@ class VectorMath {
RTC_DCHECK_EQ(z.size(), x.size());
RTC_DCHECK_EQ(z.size(), y.size());
switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
Expand Down Expand Up @@ -162,7 +162,7 @@ class VectorMath {
void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {
RTC_DCHECK_EQ(z.size(), x.size());
switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
case Aec3Optimization::kSse2: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
Expand Down
2 changes: 1 addition & 1 deletion src/modules/audio_processing/agc2/rnn_vad/rnn.cc
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ void ComputeFullyConnectedLayerOutput(

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Fully connected layer SSE2 implementation.
void ComputeFullyConnectedLayerOutputSse2(
RTC_TARGET_SSE2 void ComputeFullyConnectedLayerOutputSse2(
size_t input_size,
size_t output_size,
rtc::ArrayView<const float> input,
Expand Down
6 changes: 6 additions & 0 deletions src/rtc_base/system/inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,10 @@

#endif

// RTC_TARGET_SSE2: function annotation for SSE2-specific implementations.
//
// On GCC-compatible compilers targeting x86 / x86-64 where SSE2 is not enabled
// for the whole translation unit (__SSE2__ undefined), this expands to
// __attribute__((__target__("sse2"))) so that the annotated function alone is
// compiled with SSE2 code generation. In every other configuration it expands
// to nothing. The architecture check keeps the x86-only attribute from being
// emitted on non-x86 targets, where __SSE2__ is likewise undefined.
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \
    !defined(__SSE2__)
#define RTC_TARGET_SSE2 __attribute__((__target__("sse2")))
#else
#define RTC_TARGET_SSE2
#endif

#endif // RTC_BASE_SYSTEM_INLINE_H_