webrtc-sdk · kuba-- · May 24, 2024 · Jun 1, 2024 · Jun 2, 2024 · Jun 5, 2024
diff --git a/api/audio_codecs/L16/audio_encoder_L16.cc b/api/audio_codecs/L16/audio_encoder_L16.cc
@@ -29,6 +29,7 @@ absl::optional<AudioEncoderL16::Config> AudioEncoderL16::SdpToConfig(
   }
   Config config;
   config.sample_rate_hz = format.clockrate_hz;
+  config.pre_encoded = format.pre_encoded;
   config.num_channels = rtc::dchecked_cast<int>(format.num_channels);
   auto ptime_iter = format.parameters.find("ptime");
   if (ptime_iter != format.parameters.end()) {
@@ -66,6 +67,7 @@ std::unique_ptr<AudioEncoder> AudioEncoderL16::MakeAudioEncoder(
   c.num_channels = config.num_channels;
   c.frame_size_ms = config.frame_size_ms;
   c.payload_type = payload_type;
+  c.pre_encoded = config.pre_encoded;
   if (!config.IsOk()) {
     RTC_DCHECK_NOTREACHED();
     return nullptr;

diff --git a/api/audio_codecs/L16/audio_encoder_L16.h b/api/audio_codecs/L16/audio_encoder_L16.h
@@ -38,6 +38,7 @@ struct RTC_EXPORT AudioEncoderL16 {
     int sample_rate_hz = 8000;
     int num_channels = 1;
     int frame_size_ms = 10;
+    bool pre_encoded = false;
   };
   static absl::optional<Config> SdpToConfig(const SdpAudioFormat& audio_format);
   static void AppendSupportedEncoders(std::vector<AudioCodecSpec>* specs);

diff --git a/api/audio_codecs/audio_encoder.cc b/api/audio_codecs/audio_encoder.cc
@@ -110,5 +110,19 @@ ANAStats AudioEncoder::GetANAStats() const {
   return ANAStats();
 }
 
+// Appends `audio` to `encoded` as unencoded big-endian 16-bit samples using a
+// single AppendData() call, and returns the number of bytes appended.
+size_t AudioEncoder::AppendPreEncodeData(rtc::ArrayView<const int16_t> audio,
+                                         rtc::Buffer* encoded) {
+  const size_t num_bytes = audio.size() * sizeof(int16_t);
+  return encoded->AppendData(num_bytes, [&](rtc::ArrayView<uint8_t> out) {
+    for (size_t i = 0; i < audio.size(); ++i) {
+      out[2 * i] = static_cast<uint8_t>((audio[i] >> 8) & 0xff);  // High byte.
+      out[2 * i + 1] = static_cast<uint8_t>(audio[i] & 0xff);     // Low byte.
+    }
+    return num_bytes;
+  });
+}
+
 constexpr int AudioEncoder::kMaxNumberOfChannels;
 }  // namespace webrtc
diff --git a/api/audio_codecs/audio_encoder.h b/api/audio_codecs/audio_encoder.h
@@ -255,6 +255,10 @@ class AudioEncoder {
   virtual EncodedInfo EncodeImpl(uint32_t rtp_timestamp,
                                  rtc::ArrayView<const int16_t> audio,
                                  rtc::Buffer* encoded) = 0;
+
+  // Appends `audio` to `encoded` as raw big-endian samples; returns byte count.
+  virtual size_t AppendPreEncodeData(rtc::ArrayView<const int16_t> audio,
+                                 rtc::Buffer* encoded);
 };
 }  // namespace webrtc
 #endif  // API_AUDIO_CODECS_AUDIO_ENCODER_H_
diff --git a/api/audio_codecs/audio_format.cc b/api/audio_codecs/audio_format.cc
@@ -22,7 +22,10 @@ SdpAudioFormat::SdpAudioFormat(SdpAudioFormat&&) = default;
 SdpAudioFormat::SdpAudioFormat(absl::string_view name,
                                int clockrate_hz,
                                size_t num_channels)
-    : name(name), clockrate_hz(clockrate_hz), num_channels(num_channels) {}
+    : name(name),
+      clockrate_hz(clockrate_hz),
+      num_channels(num_channels),
+      pre_encoded(false) {}
 
 SdpAudioFormat::SdpAudioFormat(absl::string_view name,
                                int clockrate_hz,
@@ -31,7 +34,8 @@ SdpAudioFormat::SdpAudioFormat(absl::string_view name,
     : name(name),
       clockrate_hz(clockrate_hz),
       num_channels(num_channels),
-      parameters(param) {}
+      parameters(param),
+      pre_encoded(false) {}
 
 SdpAudioFormat::SdpAudioFormat(absl::string_view name,
                                int clockrate_hz,
@@ -40,7 +44,8 @@ SdpAudioFormat::SdpAudioFormat(absl::string_view name,
     : name(name),
       clockrate_hz(clockrate_hz),
       num_channels(num_channels),
-      parameters(std::move(param)) {}
+      parameters(std::move(param)),
+      pre_encoded(false) {}
 
 bool SdpAudioFormat::Matches(const SdpAudioFormat& o) const {
   return absl::EqualsIgnoreCase(name, o.name) &&

diff --git a/api/audio_codecs/audio_format.h b/api/audio_codecs/audio_format.h
@@ -56,6 +56,7 @@ struct RTC_EXPORT SdpAudioFormat {
   int clockrate_hz;
   size_t num_channels;
   Parameters parameters;
+  bool pre_encoded;
 };
 
 // Information about how an audio format is treated by the codec implementation.

diff --git a/api/audio_codecs/g711/audio_encoder_g711.cc b/api/audio_codecs/g711/audio_encoder_g711.cc
@@ -31,6 +31,7 @@ absl::optional<AudioEncoderG711::Config> AudioEncoderG711::SdpToConfig(
     config.type = is_pcmu ? Config::Type::kPcmU : Config::Type::kPcmA;
     config.num_channels = rtc::dchecked_cast<int>(format.num_channels);
     config.frame_size_ms = 20;
+    config.pre_encoded = format.pre_encoded;
     auto ptime_iter = format.parameters.find("ptime");
     if (ptime_iter != format.parameters.end()) {
       const auto ptime = rtc::StringToNumber<int>(ptime_iter->second);
@@ -75,13 +76,15 @@ std::unique_ptr<AudioEncoder> AudioEncoderG711::MakeAudioEncoder(
       AudioEncoderPcmU::Config impl_config;
       impl_config.num_channels = config.num_channels;
       impl_config.frame_size_ms = config.frame_size_ms;
+      impl_config.pre_encoded = config.pre_encoded;
       impl_config.payload_type = payload_type;
       return std::make_unique<AudioEncoderPcmU>(impl_config);
     }
     case Config::Type::kPcmA: {
       AudioEncoderPcmA::Config impl_config;
       impl_config.num_channels = config.num_channels;
       impl_config.frame_size_ms = config.frame_size_ms;
+      impl_config.pre_encoded = config.pre_encoded;
       impl_config.payload_type = payload_type;
       return std::make_unique<AudioEncoderPcmA>(impl_config);
     }

diff --git a/api/audio_codecs/g711/audio_encoder_g711.h b/api/audio_codecs/g711/audio_encoder_g711.h
@@ -37,6 +37,7 @@ struct RTC_EXPORT AudioEncoderG711 {
     Type type = Type::kPcmU;
     int num_channels = 1;
     int frame_size_ms = 20;
+    bool pre_encoded = false;
   };
   static absl::optional<AudioEncoderG711::Config> SdpToConfig(
       const SdpAudioFormat& audio_format);

diff --git a/api/audio_codecs/g722/audio_encoder_g722.cc b/api/audio_codecs/g722/audio_encoder_g722.cc
@@ -30,6 +30,7 @@ absl::optional<AudioEncoderG722Config> AudioEncoderG722::SdpToConfig(
 
   AudioEncoderG722Config config;
   config.num_channels = rtc::checked_cast<int>(format.num_channels);
+  config.pre_encoded = format.pre_encoded;
   auto ptime_iter = format.parameters.find("ptime");
   if (ptime_iter != format.parameters.end()) {
     auto ptime = rtc::StringToNumber<int>(ptime_iter->second);

diff --git a/api/audio_codecs/g722/audio_encoder_g722_config.h b/api/audio_codecs/g722/audio_encoder_g722_config.h
@@ -22,6 +22,7 @@ struct AudioEncoderG722Config {
   }
   int frame_size_ms = 20;
   int num_channels = 1;
+  bool pre_encoded = false;
 };
 
 }  // namespace webrtc

diff --git a/api/audio_codecs/ilbc/audio_encoder_ilbc.cc b/api/audio_codecs/ilbc/audio_encoder_ilbc.cc
@@ -45,6 +45,7 @@ absl::optional<AudioEncoderIlbcConfig> AudioEncoderIlbc::SdpToConfig(
   }
 
   AudioEncoderIlbcConfig config;
+  config.pre_encoded = format.pre_encoded;
   auto ptime_iter = format.parameters.find("ptime");
   if (ptime_iter != format.parameters.end()) {
     auto ptime = rtc::StringToNumber<int>(ptime_iter->second);

diff --git a/api/audio_codecs/ilbc/audio_encoder_ilbc_config.h b/api/audio_codecs/ilbc/audio_encoder_ilbc_config.h
@@ -21,6 +21,7 @@ struct AudioEncoderIlbcConfig {
   int frame_size_ms = 30;  // Valid values are 20, 30, 40, and 60 ms.
   // Note that frame size 40 ms produces encodings with two 20 ms frames in
   // them, and frame size 60 ms consists of two 30 ms frames.
+  bool pre_encoded = false;
 };
 
 }  // namespace webrtc

diff --git a/api/audio_codecs/opus/audio_encoder_opus_config.cc b/api/audio_codecs/opus/audio_encoder_opus_config.cc
@@ -44,7 +44,8 @@ AudioEncoderOpusConfig::AudioEncoderOpusConfig()
       complexity_threshold_window_bps(1500),
       dtx_enabled(false),
       uplink_bandwidth_update_interval_ms(200),
-      payload_type(-1) {}
+      payload_type(-1),
+      pre_encoded(false) {}
 AudioEncoderOpusConfig::AudioEncoderOpusConfig(const AudioEncoderOpusConfig&) =
     default;
 AudioEncoderOpusConfig::~AudioEncoderOpusConfig() = default;

diff --git a/api/audio_codecs/opus/audio_encoder_opus_config.h b/api/audio_codecs/opus/audio_encoder_opus_config.h
@@ -67,6 +67,8 @@ struct RTC_EXPORT AudioEncoderOpusConfig {
   // NOTE: This member isn't necessary, and will soon go away. See
   // https://bugs.chromium.org/p/webrtc/issues/detail?id=7847
   int payload_type;
+
+  bool pre_encoded;
 };
 
 }  // namespace webrtc

diff --git a/api/audio_options.cc b/api/audio_options.cc
@@ -56,6 +56,7 @@ void AudioOptions::SetAll(const AudioOptions& change) {
   SetFrom(&audio_network_adaptor, change.audio_network_adaptor);
   SetFrom(&audio_network_adaptor_config, change.audio_network_adaptor_config);
   SetFrom(&init_recording_on_send, change.init_recording_on_send);
+  SetFrom(&pre_encoded, change.pre_encoded);
 }
 
 bool AudioOptions::operator==(const AudioOptions& o) const {
@@ -75,7 +76,8 @@ bool AudioOptions::operator==(const AudioOptions& o) const {
          combined_audio_video_bwe == o.combined_audio_video_bwe &&
          audio_network_adaptor == o.audio_network_adaptor &&
          audio_network_adaptor_config == o.audio_network_adaptor_config &&
-         init_recording_on_send == o.init_recording_on_send;
+         init_recording_on_send == o.init_recording_on_send &&
+         pre_encoded == o.pre_encoded;
 }
 
 std::string AudioOptions::ToString() const {
@@ -100,6 +102,7 @@ std::string AudioOptions::ToString() const {
   ToStringIfSet(&result, "combined_audio_video_bwe", combined_audio_video_bwe);
   ToStringIfSet(&result, "audio_network_adaptor", audio_network_adaptor);
   ToStringIfSet(&result, "init_recording_on_send", init_recording_on_send);
+  ToStringIfSet(&result, "pre_encoded", pre_encoded);
   result << "}";
   return result.str();
 }

diff --git a/api/audio_options.h b/api/audio_options.h
@@ -73,6 +73,8 @@ struct RTC_EXPORT AudioOptions {
   // true.
   // TODO(webrtc:13566): Remove this option. See issue for details.
   absl::optional<bool> init_recording_on_send;
+  // If true, audio is supplied already encoded, so encoding is bypassed.
+  absl::optional<bool> pre_encoded;
 };
 
 }  // namespace cricket

diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc
@@ -210,17 +210,6 @@ int32_t AudioTransportImpl::NeedMorePlayData(const size_t nSamples,
                                              int64_t* elapsed_time_ms,
                                              int64_t* ntp_time_ms) {
   TRACE_EVENT0("webrtc", "AudioTransportImpl::SendProcessedData");
-  RTC_DCHECK_EQ(sizeof(int16_t) * nChannels, nBytesPerSample);
-  RTC_DCHECK_GE(nChannels, 1);
-  RTC_DCHECK_LE(nChannels, 2);
-  RTC_DCHECK_GE(
-      samplesPerSec,
-      static_cast<uint32_t>(AudioProcessing::NativeRate::kSampleRate8kHz));
-
-  // 100 = 1 second / data duration (10 ms).
-  RTC_DCHECK_EQ(nSamples * 100, samplesPerSec);
-  RTC_DCHECK_LE(nBytesPerSample * nSamples * nChannels,
-                AudioFrame::kMaxDataSizeBytes);
 
   mixer_->Mix(nChannels, &mixed_frame_);
   *elapsed_time_ms = mixed_frame_.elapsed_time_ms_;
@@ -229,12 +218,10 @@ int32_t AudioTransportImpl::NeedMorePlayData(const size_t nSamples,
   if (audio_processing_) {
     const auto error =
         ProcessReverseAudioFrame(audio_processing_, &mixed_frame_);
-    RTC_DCHECK_EQ(error, AudioProcessing::kNoError);
   }
 
   nSamplesOut = Resample(mixed_frame_, samplesPerSec, &render_resampler_,
                          static_cast<int16_t*>(audioSamples));
-  RTC_DCHECK_EQ(nSamplesOut, nChannels * nSamples);
   return 0;
 }
 

diff --git a/media/engine/webrtc_voice_engine.cc b/media/engine/webrtc_voice_engine.cc
@@ -394,6 +394,7 @@ void WebRtcVoiceEngine::Init() {
     options.audio_jitter_buffer_max_packets = 200;
     options.audio_jitter_buffer_fast_accelerate = false;
     options.audio_jitter_buffer_min_delay_ms = 0;
+    options.pre_encoded = false;
     ApplyOptions(options);
   }
   initialized_ = true;
@@ -1650,6 +1651,7 @@ bool WebRtcVoiceMediaChannel::SetSendCodecs(
           IsCodec(voice_codec, kRedCodecName))) {
       webrtc::SdpAudioFormat format(voice_codec.name, voice_codec.clockrate,
                                     voice_codec.channels, voice_codec.params);
+      format.pre_encoded = options_.pre_encoded.value_or(false);
 
       voice_codec_info = engine()->encoder_factory_->QueryAudioEncoder(format);
       if (!voice_codec_info) {

diff --git a/modules/audio_coding/codecs/g711/audio_encoder_pcm.cc b/modules/audio_coding/codecs/g711/audio_encoder_pcm.cc
@@ -29,7 +29,8 @@ AudioEncoderPcm::AudioEncoderPcm(const Config& config, int sample_rate_hz)
           static_cast<size_t>(config.frame_size_ms / 10)),
       full_frame_samples_(config.num_channels * config.frame_size_ms *
                           sample_rate_hz / 1000),
-      first_timestamp_in_buffer_(0) {
+      first_timestamp_in_buffer_(0),
+      pre_encoded_(config.pre_encoded) {
   RTC_CHECK_GT(sample_rate_hz, 0) << "Sample rate must be larger than 0 Hz";
   RTC_CHECK_EQ(config.frame_size_ms % 10, 0)
       << "Frame size must be an integer multiple of 10 ms.";
@@ -74,13 +75,16 @@ AudioEncoder::EncodedInfo AudioEncoderPcm::EncodeImpl(
   EncodedInfo info;
   info.encoded_timestamp = first_timestamp_in_buffer_;
   info.payload_type = payload_type_;
-  info.encoded_bytes = encoded->AppendData(
-      full_frame_samples_ * BytesPerSample(),
-      [&](rtc::ArrayView<uint8_t> encoded) {
-        return EncodeCall(&speech_buffer_[0], full_frame_samples_,
-                          encoded.data());
-      });
-  speech_buffer_.clear();
+  if (pre_encoded_) {
+    info.encoded_bytes = AppendPreEncodeData(audio, encoded);
+  } else {
+    info.encoded_bytes = encoded->AppendData(
+        full_frame_samples_ * BytesPerSample(),
+        [&](rtc::ArrayView<uint8_t> encoded) {
+          return EncodeCall(&speech_buffer_[0], full_frame_samples_,
+                            encoded.data());
+        });
+  }  speech_buffer_.clear();
   info.encoder_type = GetCodecType();
   return info;
 }

diff --git a/modules/audio_coding/codecs/g711/audio_encoder_pcm.h b/modules/audio_coding/codecs/g711/audio_encoder_pcm.h
@@ -29,10 +29,11 @@ class AudioEncoderPcm : public AudioEncoder {
     int frame_size_ms;
     size_t num_channels;
     int payload_type;
+    bool pre_encoded;
 
    protected:
     explicit Config(int pt)
-        : frame_size_ms(20), num_channels(1), payload_type(pt) {}
+        : frame_size_ms(20), num_channels(1), payload_type(pt), pre_encoded(false) {}
   };
 
   ~AudioEncoderPcm() override;
@@ -67,6 +68,7 @@ class AudioEncoderPcm : public AudioEncoder {
   const int sample_rate_hz_;
   const size_t num_channels_;
   const int payload_type_;
+  bool pre_encoded_;
   const size_t num_10ms_frames_per_packet_;
   const size_t full_frame_samples_;
   std::vector<int16_t> speech_buffer_;

diff --git a/modules/audio_coding/codecs/g722/audio_encoder_g722.cc b/modules/audio_coding/codecs/g722/audio_encoder_g722.cc
@@ -33,7 +33,8 @@ AudioEncoderG722Impl::AudioEncoderG722Impl(const AudioEncoderG722Config& config,
       num_10ms_frames_buffered_(0),
       first_timestamp_in_buffer_(0),
       encoders_(new EncoderState[num_channels_]),
-      interleave_buffer_(2 * num_channels_) {
+      interleave_buffer_(2 * num_channels_),
+      pre_encoded_(config.pre_encoded) {
   RTC_CHECK(config.IsOk());
   const size_t samples_per_channel =
       kSampleRateHz / 100 * num_10ms_frames_per_packet_;
@@ -103,38 +104,44 @@ AudioEncoder::EncodedInfo AudioEncoderG722Impl::EncodeImpl(
     return EncodedInfo();
   }
 
-  // Encode each channel separately.
-  RTC_CHECK_EQ(num_10ms_frames_buffered_, num_10ms_frames_per_packet_);
-  num_10ms_frames_buffered_ = 0;
-  const size_t samples_per_channel = SamplesPerChannel();
-  for (size_t i = 0; i < num_channels_; ++i) {
-    const size_t bytes_encoded = WebRtcG722_Encode(
-        encoders_[i].encoder, encoders_[i].speech_buffer.get(),
-        samples_per_channel, encoders_[i].encoded_buffer.data());
-    RTC_CHECK_EQ(bytes_encoded, samples_per_channel / 2);
-  }
-
-  const size_t bytes_to_encode = samples_per_channel / 2 * num_channels_;
   EncodedInfo info;
-  info.encoded_bytes = encoded->AppendData(
-      bytes_to_encode, [&](rtc::ArrayView<uint8_t> encoded) {
-        // Interleave the encoded bytes of the different channels. Each separate
-        // channel and the interleaved stream encodes two samples per byte, most
-        // significant half first.
-        for (size_t i = 0; i < samples_per_channel / 2; ++i) {
-          for (size_t j = 0; j < num_channels_; ++j) {
-            uint8_t two_samples = encoders_[j].encoded_buffer.data()[i];
-            interleave_buffer_.data()[j] = two_samples >> 4;
-            interleave_buffer_.data()[num_channels_ + j] = two_samples & 0xf;
+  if (pre_encoded_) {
+    info.encoded_bytes = AppendPreEncodeData(audio, encoded);
+  } else {
+    // Encode each channel separately.
+    RTC_CHECK_EQ(num_10ms_frames_buffered_, num_10ms_frames_per_packet_);
+    num_10ms_frames_buffered_ = 0;
+    const size_t samples_per_channel = SamplesPerChannel();
+    for (size_t i = 0; i < num_channels_; ++i) {
+      const size_t bytes_encoded = WebRtcG722_Encode(
+          encoders_[i].encoder, encoders_[i].speech_buffer.get(),
+          samples_per_channel, encoders_[i].encoded_buffer.data());
+      RTC_CHECK_EQ(bytes_encoded, samples_per_channel / 2);
+    }
+
+    const size_t bytes_to_encode = samples_per_channel / 2 * num_channels_;
+
+    info.encoded_bytes = encoded->AppendData(
+        bytes_to_encode, [&](rtc::ArrayView<uint8_t> encoded) {
+          // Interleave the encoded bytes of the different channels. Each separate
+          // channel and the interleaved stream encodes two samples per byte, most
+          // significant half first.
+          for (size_t i = 0; i < samples_per_channel / 2; ++i) {
+            for (size_t j = 0; j < num_channels_; ++j) {
+              uint8_t two_samples = encoders_[j].encoded_buffer.data()[i];
+              interleave_buffer_.data()[j] = two_samples >> 4;
+              interleave_buffer_.data()[num_channels_ + j] = two_samples & 0xf;
+            }
+            for (size_t j = 0; j < num_channels_; ++j)
+              encoded[i * num_channels_ + j] =
+                  interleave_buffer_.data()[2 * j] << 4 |
+                  interleave_buffer_.data()[2 * j + 1];
           }
-          for (size_t j = 0; j < num_channels_; ++j)
-            encoded[i * num_channels_ + j] =
-                interleave_buffer_.data()[2 * j] << 4 |
-                interleave_buffer_.data()[2 * j + 1];
-        }
-
-        return bytes_to_encode;
-      });
+
+          return bytes_to_encode;
+        });
+  }
+
   info.encoded_timestamp = first_timestamp_in_buffer_;
   info.payload_type = payload_type_;
   info.encoder_type = CodecType::kG722;