diff --git a/manim/scene/scene_file_writer.py b/manim/scene/scene_file_writer.py
index ebd7d13cac..23ff6a88f7 100644
--- a/manim/scene/scene_file_writer.py
+++ b/manim/scene/scene_file_writer.py
@@ -251,11 +251,6 @@ def init_audio(self):
         Preps the writer for adding audio to the movie.
         """
         self.includes_sound = False
-
-    def create_audio_segment(self):
-        """
-        Creates an empty, silent, Audio Segment.
-        """
         self.audio_segment = AudioSegment.silent()
 
     def add_audio_segment(
@@ -280,9 +275,7 @@ def add_audio_segment(
         gain_to_background
             The gain of the segment from the background.
         """
-        if not self.includes_sound:
-            self.includes_sound = True
-            self.create_audio_segment()
+        self.includes_sound = True
         segment = self.audio_segment
         curr_end = segment.duration_seconds
         if time is None:
@@ -502,6 +495,16 @@ def open_partial_movie_stream(self, file_path=None):
             self.video_container = video_container
             self.video_stream = stream
 
+            # Regardless of the value of `self.includes_sound`, always add
+            # an audio stream, in case audio is added to any one of the
+            # partial movies.
+            # This is needed because the concat format requires all video
+            # files to have the same number of streams.
+            self.partial_movie_start_time = self.renderer.time
+            self.audio_stream = self.video_container.add_stream(
+                "libvorbis" if config.format == "webm" else "aac",
+            )
+
     def close_partial_movie_stream(self):
         """Close the currently opened video container.
 
@@ -509,9 +512,29 @@ def close_partial_movie_stream(self):
         in the video stream holding a partial file, and then close
         the corresponding container.
         """
+        start = int(np.ceil(1000 * self.partial_movie_start_time))
+        end = int(np.ceil(1000 * self.renderer.time))
+        # Pad with silence so the audio covers this whole partial movie (in ms).
+        if (duration := len(self.audio_segment)) < end:
+            self.audio_segment += AudioSegment.silent(duration=end - duration)
+
+        sound = self.audio_segment[start:end]
+        array = np.frombuffer(sound.raw_data, dtype=np.int16).reshape(1, -1)
+        layout = "stereo" if sound.channels == 2 else "mono"
+        frame = av.AudioFrame.from_ndarray(array, layout=layout)
+        frame.rate = sound.frame_rate
+
+        for packet in self.audio_stream.encode(frame):
+            self.video_container.mux(packet)
+
+        # Flushing packets
+
         for packet in self.video_stream.encode():
             self.video_container.mux(packet)
 
+        for packet in self.audio_stream.encode():
+            self.video_container.mux(packet)
+
         self.video_container.close()
 
         logger.info(
@@ -544,8 +567,8 @@ def combine_files(
         self,
         input_files: list[str],
         output_file: Path,
-        create_gif=False,
-        includes_sound=False,
+        create_gif: bool = False,
+        includes_sound: bool = False,
     ):
         file_list = self.partial_movie_directory / "partial_movie_file_list.txt"
         logger.debug(
@@ -568,17 +591,26 @@ def combine_files(
         partial_movies_input = av.open(
             str(file_list), options=av_options, format="concat"
         )
-        partial_movies_stream = partial_movies_input.streams.video[0]
+        partial_movies_video_stream = partial_movies_input.streams.video[0]
+        if includes_sound and not create_gif:
+            partial_movies_audio_stream = partial_movies_input.streams.audio[0]
+        else:
+            partial_movies_audio_stream = None
+
         output_container = av.open(str(output_file), mode="w")
         output_container.metadata["comment"] = (
             f"Rendered with Manim Community v{__version__}"
         )
-        output_stream = output_container.add_stream(
+        output_video_stream = output_container.add_stream(
             codec_name="gif" if create_gif else None,
-            template=partial_movies_stream if not create_gif else None,
+            template=partial_movies_video_stream if not create_gif else None,
         )
+        if partial_movies_audio_stream is not None:  # only when audio is demuxed
+            output_audio_stream = output_container.add_stream(
+                template=partial_movies_audio_stream,
+            )
         if config.transparent and config.format == "webm":
-            output_stream.pix_fmt = "yuva420p"
+            output_video_stream.pix_fmt = "yuva420p"
         if create_gif:
             """
             The following solution was largely inspired from this comment
@@ -586,14 +618,14 @@ def combine_files(
             and the following code
             https://github.com/imageio/imageio/blob/65d79140018bb7c64c0692ea72cb4093e8d632a0/imageio/plugins/pyav.py#L927-L996.
             """
-            output_stream.pix_fmt = "rgb8"
+            output_video_stream.pix_fmt = "rgb8"
             if config.transparent:
-                output_stream.pix_fmt = "pal8"
-            output_stream.width = config.pixel_width
-            output_stream.height = config.pixel_height
-            output_stream.rate = config.frame_rate
+                output_video_stream.pix_fmt = "pal8"
+            output_video_stream.width = config.pixel_width
+            output_video_stream.height = config.pixel_height
+            output_video_stream.rate = config.frame_rate
             graph = av.filter.Graph()
-            input_buffer = graph.add_buffer(template=partial_movies_stream)
+            input_buffer = graph.add_buffer(template=partial_movies_video_stream)
             split = graph.add("split")
             palettegen = graph.add("palettegen", "stats_mode=diff")
             paletteuse = graph.add(
@@ -618,27 +650,35 @@ def combine_files(
             while True:
                 try:
                     frame = graph.pull()
-                    frame.time_base = output_stream.codec_context.time_base
+                    frame.time_base = output_video_stream.codec_context.time_base
                     frame.pts = frames_written
                     frames_written += 1
-                    output_container.mux(output_stream.encode(frame))
+                    output_container.mux(output_video_stream.encode(frame))
                 except av.error.EOFError:
                     break
 
-            for packet in output_stream.encode():
+            for packet in output_video_stream.encode():
                 output_container.mux(packet)
 
         else:
-            for packet in partial_movies_input.demux(partial_movies_stream):
+            for packet in partial_movies_input.demux(
+                partial_movies_video_stream, partial_movies_audio_stream
+            ):
                 # We need to skip the "flushing" packets that `demux` generates.
                 if packet.dts is None:
                     continue
 
+                ptype = packet.stream.type
+
                 packet.dts = None  # This seems to be needed, as dts from consecutive
                 # files may not be monotically increasing, so we let libav compute it.
 
                 # We need to assign the packet to the new stream.
-                packet.stream = output_stream
+                if ptype == "video":
+                    packet.stream = output_video_stream
+                elif ptype == "audio":
+                    packet.stream = output_audio_stream
+
                 output_container.mux(packet)
 
         partial_movies_input.close()
@@ -668,85 +708,8 @@ def combine_to_movie(self):
             partial_movie_files,
             movie_file_path,
             is_gif_format(),
-            self.includes_sound,
+            includes_sound=self.includes_sound,
         )
-
-        # handle sound
-        if self.includes_sound and config.format != "gif":
-            sound_file_path = movie_file_path.with_suffix(".wav")
-            # Makes sure sound file length will match video file
-            self.add_audio_segment(AudioSegment.silent(0))
-            self.audio_segment.export(
-                sound_file_path,
-                format="wav",
-                bitrate="312k",
-            )
-            # Audio added to a VP9 encoded (webm) video file needs
-            # to be encoded as vorbis or opus. Directly exporting
-            # self.audio_segment with such a codec works in principle,
-            # but tries to call ffmpeg via its CLI -- which we want
-            # to avoid. This is why we need to do the conversion
-            # manually.
-            if config.format == "webm":
-                with (
-                    av.open(sound_file_path) as wav_audio,
-                    av.open(sound_file_path.with_suffix(".ogg"), "w") as opus_audio,
-                ):
-                    wav_audio_stream = wav_audio.streams.audio[0]
-                    opus_audio_stream = opus_audio.add_stream("libvorbis")
-                    for frame in wav_audio.decode(wav_audio_stream):
-                        for packet in opus_audio_stream.encode(frame):
-                            opus_audio.mux(packet)
-
-                    for packet in opus_audio_stream.encode():
-                        opus_audio.mux(packet)
-
-                sound_file_path = sound_file_path.with_suffix(".ogg")
-
-            temp_file_path = movie_file_path.with_name(
-                f"{movie_file_path.stem}_temp{movie_file_path.suffix}"
-            )
-            av_options = {
-                "shortest": "1",
-                "metadata": f"comment=Rendered with Manim Community v{__version__}",
-            }
-
-            with (
-                av.open(movie_file_path) as video_input,
-                av.open(sound_file_path) as audio_input,
-            ):
-
-                video_stream = video_input.streams.video[0]
-                audio_stream = audio_input.streams.audio[0]
-                output_container = av.open(
-                    str(temp_file_path), mode="w", options=av_options
-                )
-                output_video_stream = output_container.add_stream(template=video_stream)
-                output_audio_stream = output_container.add_stream(template=audio_stream)
-
-                for packet in video_input.demux(video_stream):
-                    # We need to skip the "flushing" packets that `demux` generates.
-                    if packet.dts is None:
-                        continue
-
-                    # We need to assign the packet to the new stream.
-                    packet.stream = output_video_stream
-                    output_container.mux(packet)
-
-                for packet in audio_input.demux(audio_stream):
-                    # We need to skip the "flushing" packets that `demux` generates.
-                    if packet.dts is None:
-                        continue
-
-                    # We need to assign the packet to the new stream.
-                    packet.stream = output_audio_stream
-                    output_container.mux(packet)
-
-                output_container.close()
-
-            shutil.move(str(temp_file_path), str(movie_file_path))
-            sound_file_path.unlink()
-
         self.print_file_ready_message(str(movie_file_path))
         if write_to_movie():
             for file_path in partial_movie_files:
@@ -765,6 +728,7 @@ def combine_to_section_videos(self) -> None:
                 self.combine_files(
                     section.get_clean_partial_movie_files(),
                     self.sections_output_dir / section.video,
+                    includes_sound=self.includes_sound,
                 )
                 sections_index.append(section.get_dict(self.sections_output_dir))
         with (self.sections_output_dir / f"{self.output_name}.json").open("w") as file: