jeertmans · taibeled · Nov 10, 2024 · Nov 10, 2024 · Nov 10, 2024 · Nov 10, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -35,13 +35,13 @@ jobs:
 
     - name: Install manim dependencies on MacOS
       if: matrix.os == 'macos-latest'
-      run: brew install ffmpeg py3cairo pango pkg-config scipy
+      run: brew install ffmpeg py3cairo pango pkg-config scipy portaudio
 
     - name: Install manim dependencies on Ubuntu
       if: matrix.os == 'ubuntu-latest'
       run: |
         sudo apt-get update
-        sudo apt-get install build-essential python3-dev libcairo2-dev libpango1.0-dev ffmpeg freeglut3-dev xvfb
+        sudo apt-get install build-essential python3-dev libcairo2-dev libpango1.0-dev ffmpeg freeglut3-dev xvfb portaudio19-dev
         nohup Xvfb $DISPLAY &
 
     - name: Install Windows dependencies
@@ -79,13 +79,13 @@ jobs:
 
     - name: Install manim dependencies on MacOS
       if: matrix.os == 'macos-latest'
-      run: brew install ffmpeg py3cairo pango pkg-config scipy
+      run: brew install ffmpeg py3cairo pango pkg-config scipy portaudio
 
     - name: Install manim dependencies on Ubuntu
       if: matrix.os == 'ubuntu-latest'
       run: |
         sudo apt-get update
-        sudo apt-get install build-essential python3-dev libcairo2-dev libpango1.0-dev ffmpeg freeglut3-dev xvfb
+        sudo apt-get install build-essential python3-dev libcairo2-dev libpango1.0-dev ffmpeg freeglut3-dev xvfb portaudio19-dev
         nohup Xvfb $DISPLAY &
 
     - name: Install Windows dependencies

diff --git a/.gitignore b/.gitignore
@@ -48,3 +48,6 @@ paper/media/
 coverage.xml
 
 rendering_times.csv
+
+# Manim Voice Over
+.env
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -13,7 +13,8 @@ RUN apt-get update -qq \
         pkg-config \
         make \
         wget \
-        ghostscript
+        ghostscript \
+        portaudio19-dev
 
 # setup a minimal texlive installation
 COPY docker/texlive-profile.txt /tmp/

diff --git a/example.py b/example.py
@@ -2,26 +2,62 @@
 # type: ignore
 
 from manim_slides import Slide, ThreeDSlide
-from manim_slides.slide import MANIM, MANIMGL
+from manim_slides.slide import MANIM, MANIM_VOICEOVER, MANIMGL
+
+OPENAI = False
 
 if MANIM:
     from manim import *
 elif MANIMGL:
     from manimlib import *
 
+if MANIM_VOICEOVER:
+    from manim_voiceover import VoiceoverScene
 
-class BasicExample(Slide):
-    def construct(self):
-        circle = Circle(radius=3, color=BLUE)
-        dot = Dot()
+    if OPENAI:
+        from manim_voiceover.services.openai import OpenAIService as SpeechService
+    else:
+        from manim_voiceover.services.gtts import GTTSService as SpeechService
 
-        self.play(GrowFromCenter(circle))
+if not MANIM_VOICEOVER:
 
-        self.next_slide(loop=True)
-        self.play(MoveAlongPath(dot, circle), run_time=2, rate_func=linear)
-        self.next_slide()
+    class BasicExample(Slide):
+        def construct(self):
+            circle = Circle(radius=3, color=BLUE)
+            dot = Dot()
+
+            self.play(GrowFromCenter(circle))
+
+            self.next_slide(loop=True)
+            self.play(MoveAlongPath(dot, circle), run_time=2, rate_func=linear)
+            self.next_slide()
+
+            self.play(dot.animate.move_to(ORIGIN))
+else:
+
+    class BasicExample(Slide, VoiceoverScene):
+        def construct(self):
+            self.set_speech_service(SpeechService())
+
+            circle = Circle(radius=3, color=BLUE)
+            dot = Dot()
+
+            with self.voiceover(text="This is a circle") as tracker:
+                self.play(GrowFromCenter(circle), run_time=tracker.duration)
+
+            self.next_slide(loop=True)
+            with self.voiceover(text="Now a dot is moving along the circle") as tracker:
+                self.play(
+                    MoveAlongPath(dot, circle),
+                    rate_func=linear,
+                    run_time=tracker.duration,
+                )
+            self.next_slide()
 
-        self.play(dot.animate.move_to(ORIGIN))
+            with self.voiceover(
+                text="Now the dot is moving back to the center of the circle"
+            ) as tracker:
+                self.play(dot.animate.move_to(ORIGIN), run_time=tracker.duration)
 
 
 class ConvertExample(Slide):

diff --git a/manim_slides/config.py b/manim_slides/config.py
@@ -19,6 +19,7 @@
     model_validator,
 )
 from pydantic_extra_types.color import Color
+from typing_extensions import TypedDict
 
 from .logger import logger
 
@@ -151,6 +152,11 @@ def merge_with(self, other: "Config") -> "Config":
         return self
 
 
+class RelativeAudioType(TypedDict):
+    """Audio file attached to a slide, positioned relative to that slide."""
+
+    # Start of the audio, measured from the beginning of the slide
+    # (presumably in seconds -- TODO confirm against the renderer's clock).
+    starting_time: float
+    # Path to the audio file to add to the slide's video.
+    file: Path
+
+
 class BaseSlideConfig(BaseModel):  # type: ignore
     """Base class for slide config."""
 
@@ -160,6 +166,7 @@ class BaseSlideConfig(BaseModel):  # type: ignore
     reversed_playback_rate: float = 1.0
     notes: str = ""
     dedent_notes: bool = True
+    audio: list[RelativeAudioType] = []
 
     @classmethod
     def wrapper(cls, arg_name: str) -> Callable[..., Any]:

diff --git a/manim_slides/slide/__init__.py b/manim_slides/slide/__init__.py
@@ -1,6 +1,7 @@
 __all__ = [
     "MANIM",
     "MANIMGL",
+    "MANIM_VOICEOVER",
     "API_NAME",
     "Slide",
     "ThreeDSlide",
@@ -48,6 +49,7 @@ def __init__(self) -> None:
 
 MANIM: bool = API_NAME == "manim"
 MANIMGL: bool = API_NAME == "manimlib"
+MANIM_VOICEOVER: bool = "manim_voiceover" in sys.modules
 
 if MANIM:
     try:

diff --git a/manim_slides/slide/base.py b/manim_slides/slide/base.py
@@ -18,7 +18,12 @@
 from ..config import BaseSlideConfig, PresentationConfig, PreSlideConfig, SlideConfig
 from ..defaults import FOLDER_PATH
 from ..logger import logger
-from ..utils import concatenate_video_files, merge_basenames, reverse_video_file
+from ..utils import (
+    add_audio_to_video,
+    concatenate_video_files,
+    merge_basenames,
+    reverse_video_file,
+)
 from . import MANIM
 
 if TYPE_CHECKING:
@@ -538,6 +543,12 @@
                 else:
                     reverse_video_file(dst_file, rev_file)
 
+            if pre_slide_config.audio:
+                new_dst_file = dst_file.with_stem(dst_file.stem + "_audio")
+                add_audio_to_video(dst_file, new_dst_file, pre_slide_config.audio)
+                dst_file.unlink()
+                dst_file = new_dst_file
+
             slides.append(
                 SlideConfig.from_pre_slide_config_and_files(
                     pre_slide_config, dst_file, rev_file

diff --git a/manim_slides/slide/manim.py b/manim_slides/slide/manim.py
@@ -4,11 +4,22 @@
 from manim import Scene, ThreeDScene, config
 from manim.renderer.opengl_renderer import OpenGLRenderer
 from manim.utils.color import rgba_to_color
+from typing_extensions import TypedDict
 
 from ..config import BaseSlideConfig
 from .base import BaseSlide
 
 
+class AudioType(TypedDict):
+    """Sound added to the scene, positioned on the scene's own timeline."""
+
+    # Renderer time at which the sound was added to the scene.
+    starting_time: float
+    # Path to the sound file.
+    file: Path
+
+
+class SlideAudioType(TypedDict):
+    """Sounds collected for the slide that is currently being built."""
+
+    # Renderer time at which the current slide started.
+    starting_time: float
+    # Sounds added since the current slide started.
+    audio: list[AudioType]
+
+
 class Slide(BaseSlide, Scene):  # type: ignore[misc]
     """
     Inherits from :class:`Scene<manim.scene.scene.Scene>` and provide necessary tools
@@ -31,6 +42,13 @@
         for the current slide config.
     """
 
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize the scene and the audio bookkeeping of the first slide."""
+        super().__init__(*args, **kwargs)
+        # A slide is only added to self._slides once next_slide() is called,
+        # so the sounds added in the meantime are collected here and get
+        # attached to the slide afterwards, from next_slide().
+        self.slide_audio: SlideAudioType = {
+            "starting_time": 0,
+            "audio": [],
+        }
+
     @property
     def _frame_shape(self) -> tuple[float, float]:
         if isinstance(self.renderer, OpenGLRenderer):
@@ -117,6 +135,12 @@
             base_slide_config=base_slide_config,
         )
 
+        self.add_audio_to_slide()
+        self.slide_audio = {
+            "starting_time": self.renderer.time,
+            "audio": [],
+        }
+
     def render(self, *args: Any, **kwargs: Any) -> None:
         """MANIM renderer."""
         # We need to disable the caching limit since we rely on intermediate files
@@ -142,6 +166,35 @@
         if flush_manim_cache:
             self.renderer.file_writer.flush_cache_directory()
 
+    def _add_last_slide(self) -> None:
+        """
+        Add the last slide, then attach the pending sounds to it.
+
+        The pending sounds are the ones collected in ``self.slide_audio``.
+        """
+        super()._add_last_slide()
+        self.add_audio_to_slide()
+
+    def add_audio_to_slide(self) -> None:
+        """
+        Attach the sounds collected in ``self.slide_audio`` to the latest slide.
+
+        Starting times are converted from the scene's timeline to times
+        relative to the start of the slide.
+        """
+        pending_sounds = self.slide_audio["audio"]
+        if not pending_sounds:
+            # Nothing to attach, so the latest slide is left untouched.
+            return
+
+        slide_start = self.slide_audio["starting_time"]
+        self._slides[-1].audio.extend(
+            [
+                {
+                    "starting_time": sound["starting_time"] - slide_start,
+                    "file": sound["file"],
+                }
+                for sound in pending_sounds
+            ]
+        )
+
+    def add_sound(
+        self,
+        sound_file: str,
+        time_offset: float = 0,
+        gain: Optional[float] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Record the sound for the current slide, then add it to the scene.
+
+        The sound is stored in ``self.slide_audio`` so that it can later be
+        attached to the slide's own video file, and is then forwarded
+        unchanged to the parent implementation.
+
+        :param sound_file: Path to the sound file.
+        :param time_offset: Offset, relative to the current renderer time,
+            at which the sound is placed.
+        :param gain: Forwarded to the parent implementation. It is not
+            recorded, so it does not apply to the per-slide audio.
+        :param kwargs: Extra keyword arguments forwarded to the parent
+            implementation.
+        """
+        self.slide_audio["audio"].append(
+            {
+                # The parent places the sound ``time_offset`` after the current
+                # time, so the recorded start must include that offset too.
+                "starting_time": self.renderer.time + time_offset,
+                "file": Path(sound_file),
+            }
+        )
+        super().add_sound(sound_file, time_offset, gain, **kwargs)
+
 
 class ThreeDSlide(Slide, ThreeDScene):  # type: ignore[misc]
     """

diff --git a/manim_slides/utils.py b/manim_slides/utils.py
@@ -6,6 +6,7 @@
 
 import av
 
+from .config import RelativeAudioType
 from .logger import logger
 
 
@@ -62,6 +63,43 @@
     os.unlink(tmp_file)  # https://stackoverflow.com/a/54768241
 
 
+def add_audio_to_video(
+    video: Path, dest_file: Path, audio_files: list[RelativeAudioType]
+) -> None:
+    """
+    Copy a video file to a new file and add audio streams to it.
+
+    Packets are copied as-is (no re-encoding). Each audio file is added as
+    its own output stream, shifted by its ``starting_time``.
+
+    :param video: Path to the input video file.
+    :param dest_file: Path to the output file, containing video and audio.
+    :param audio_files: Audio files to add, with their starting times
+        relative to the start of the video, in seconds.
+    """
+    with (
+        av.open(str(video), mode="r") as input_container,
+        av.open(str(dest_file), mode="w") as output_container,
+    ):
+        video_stream = input_container.streams.video[0]
+        output_video_stream = output_container.add_stream(template=video_stream)
+
+        # NOTE(review): every audio file becomes a separate output stream, the
+        # streams are not mixed together -- confirm that players handle this
+        # as intended when a slide has more than one sound.
+        for audio_file in audio_files:
+            audio = audio_file["file"]
+            offset = audio_file.get("starting_time", 0)
+
+            with av.open(str(audio)) as audio_container:
+                audio_stream = audio_container.streams.audio[0]
+                output_audio_stream = output_container.add_stream(template=audio_stream)
+
+                # Timestamps are integers expressed in units of the stream's
+                # time base, while the offset is in seconds: convert it once.
+                offset_ticks = round(offset / audio_stream.time_base)
+
+                for packet in audio_container.demux(audio_stream):
+                    # Demuxing ends with a flushing packet that has no dts.
+                    if packet.dts is None:
+                        continue
+
+                    packet.stream = output_audio_stream
+                    if packet.pts is not None:
+                        packet.pts += offset_ticks
+                    packet.dts += offset_ticks
+
+                    output_container.mux(packet)
+
+        for packet in input_container.demux(video_stream):
+            if packet.dts is None:
+                continue
+
+            packet.stream = output_video_stream
+            output_container.mux(packet)
+
+
 def merge_basenames(files: list[Path]) -> Path:
     """Merge multiple filenames by concatenating basenames."""
     if len(files) == 0:

diff --git a/pyproject.toml b/pyproject.toml
@@ -55,7 +55,7 @@ docs = [
   "sphinxext-opengraph>=0.7.5",
 ]
 full = [
-  "manim-slides[magic,manim,sphinx-directive]",
+  "manim-slides[magic,manim,sphinx-directive,voiceover]",
 ]
 magic = ["manim-slides[manim]", "ipython>=8.12.2"]
 manim = ["manim>=0.18.0"]
@@ -73,6 +73,9 @@ tests = [
   "pytest-missing-modules>=0.1.0",
   "pytest-qt>=4.2.0",
 ]
+voiceover = [
+  "manim-voiceover[all]>=0.3.7",
+]
 
 [project.scripts]
 manim-slides = "manim_slides.__main__:cli"
@@ -232,4 +235,6 @@ override-dependencies = [
   "manimpango>=0.5.0,<1.0.0",
   "numpy<=1.24;python_version < '3.12'",
   "numpy>=1.26;python_version >= '3.12'",
+  # A newer version is needed, otherwise triton cannot be installed properly
+  "openai-whisper>=20240927",
 ]