Restructured to output stems in subfolder and separate backing vocals properly
nomadkaraoke · Dec 5, 2024 · ef3a2df · ef3a2df
1 parent 1b26fe9
commit ef3a2df
Show file tree

Hide file tree

Showing 3 changed files with 128 additions and 75 deletions.
diff --git a/karaoke_prep/karaoke_prep.py b/karaoke_prep/karaoke_prep.py
@@ -23,11 +23,12 @@ def __init__(
         dry_run=False,
         log_level=logging.DEBUG,
         log_formatter=None,
-        model_names=["UVR_MDXNET_KARA_2.onnx", "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt"],
+        clean_instrumental_model="model_bs_roformer_ep_317_sdr_12.9755.ckpt",
+        backing_vocals_models=["mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", "UVR-BVE-4B_SN-44100-1.pth"],
+        other_stems_models=["htdemucs_6s.yaml"],
         model_file_dir=os.path.join(tempfile.gettempdir(), "audio-separator-models"),
         output_dir=".",
         lossless_output_format="FLAC",
-        lossy_output_format="MP3",
         use_cuda=False,
         use_coreml=False,
         normalization_enabled=True,
@@ -89,11 +90,12 @@ def __init__(
         self.artist = artist
         self.title = title
         self.filename_pattern = filename_pattern
-        self.model_names = model_names
+        self.clean_instrumental_model = clean_instrumental_model
+        self.backing_vocals_models = backing_vocals_models
+        self.other_stems_models = other_stems_models
         self.model_file_dir = model_file_dir
         self.output_dir = output_dir
         self.lossless_output_format = lossless_output_format.lower()
-        self.lossy_output_format = lossy_output_format.lower()
         self.use_cuda = use_cuda
         self.use_coreml = use_coreml
         self.normalization_enabled = normalization_enabled
@@ -717,6 +719,80 @@ def hex_to_rgb(self, hex_color):
         hex_color = hex_color.lstrip("#")
         return tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))
 
+    def process_audio_separation(self, audio_file, artist_title, track_output_dir):
+        from audio_separator.separator import Separator
+
+        self.logger.info(f"Starting audio separation process for {artist_title}")
+
+        separator = Separator(
+            log_level=self.log_level,
+            log_formatter=self.log_formatter,
+            model_file_dir=self.model_file_dir,
+            output_format=self.lossless_output_format,
+        )
+
+        # Every stem except the main instrumental goes into a "stems" subfolder of the track directory
+        stems_dir = os.path.join(track_output_dir, "stems")
+        os.makedirs(stems_dir, exist_ok=True)
+        self.logger.info(f"Created stems directory: {stems_dir}")
+
+        result = {"clean_instrumental": {}, "other_stems": {}, "backing_vocals": {}}
+
+        # Step 1: Run clean_instrumental_model on the input audio; the "(Instrumental)" output is renamed into track_output_dir and the "(Vocals)" output into stems_dir
+        self.logger.info(f"Step 1: Separating using clean instrumental model: {self.clean_instrumental_model}")
+        instrumental_path = os.path.join(
+            track_output_dir, f"{artist_title} (Instrumental {self.clean_instrumental_model}).{self.lossless_output_format}"
+        )
+        vocals_path = os.path.join(stems_dir, f"{artist_title} (Vocals {self.clean_instrumental_model}).{self.lossless_output_format}")
+
+        separator.load_model(model_filename=self.clean_instrumental_model)
+        clean_output_files = separator.separate(audio_file)
+
+        for file in clean_output_files:
+            if "(Vocals)" in file:
+                os.rename(file, vocals_path)
+                result["clean_instrumental"]["vocals"] = vocals_path
+            elif "(Instrumental)" in file:
+                os.rename(file, instrumental_path)
+                result["clean_instrumental"]["instrumental"] = instrumental_path
+
+        # Step 2: Run each other_stems model on the original input audio; each stem name is the text between the last "_(" and the last ")_" of the output filename
+        self.logger.info(f"Step 2: Separating using other stems models: {self.other_stems_models}")
+        for model in self.other_stems_models:
+            self.logger.info(f"Processing with model: {model}")
+            separator.load_model(model_filename=model)
+            other_stems_output = separator.separate(audio_file)
+
+            result["other_stems"][model] = {}
+            for file in other_stems_output:
+                file_name = os.path.basename(file)
+                stem_name = file_name[file_name.rfind("_(") + 2 : file_name.rfind(")_")]
+                new_filename = f"{artist_title} ({stem_name} {model}).{self.lossless_output_format}"
+                other_stem_path = os.path.join(stems_dir, new_filename)
+                os.rename(file, other_stem_path)
+                result["other_stems"][model][stem_name] = other_stem_path
+
+        # Step 3: Run each backing vocals model on the Step 1 vocals file; its "(Vocals)" output is stored as lead vocals and its "(Instrumental)" output as backing vocals. NOTE(review): vocals_path only exists if Step 1 produced a "(Vocals)" file - confirm
+        self.logger.info(f"Step 3: Separating clean vocals using backing vocals models: {self.backing_vocals_models}")
+        for model in self.backing_vocals_models:
+            self.logger.info(f"Processing with model: {model}")
+            separator.load_model(model_filename=model)
+            backing_vocals_output = separator.separate(vocals_path)
+
+            result["backing_vocals"][model] = {}
+            for file in backing_vocals_output:
+                if "(Vocals)" in file:
+                    lead_vocals_path = os.path.join(stems_dir, f"{artist_title} (Lead Vocals {model}).{self.lossless_output_format}")
+                    os.rename(file, lead_vocals_path)
+                    result["backing_vocals"][model]["lead_vocals"] = lead_vocals_path
+                elif "(Instrumental)" in file:
+                    backing_vocals_path = os.path.join(stems_dir, f"{artist_title} (Backing Vocals {model}).{self.lossless_output_format}")
+                    os.rename(file, backing_vocals_path)
+                    result["backing_vocals"][model]["backing_vocals"] = backing_vocals_path
+
+        self.logger.info("Audio separation process completed")
+        return result
+
     def prep_single_track(self):
         self.logger.info(f"Preparing single track: {self.artist} - {self.title}")
 
@@ -847,65 +923,24 @@ def prep_single_track(self):
             existing_instrumental_extension = os.path.splitext(self.existing_instrumental)[1]
 
             instrumental_path = os.path.join(track_output_dir, f"{artist_title} (Instrumental Custom){existing_instrumental_extension}")
-            instrumental_path_lossy = os.path.join(track_output_dir, f"{artist_title} (Instrumental Custom).{self.lossy_output_format}")
 
             shutil.copy2(self.existing_instrumental, instrumental_path)
-            self.convert_to_lossy(instrumental_path, instrumental_path_lossy)
 
             processed_track["separated_audio"]["Custom"] = {
                 "instrumental": instrumental_path,
-                "instrumental_lossy": instrumental_path_lossy,
                 "vocals": None,
-                "vocals_lossy": None,
             }
         else:
-            self.logger.info(f"Separating audio for track: {self.title} by {self.artist} using models: {', '.join(self.model_names)}")
-            for model_name in self.model_names:
-                processed_track[f"separated_audio"][model_name] = {}
-                instrumental_path = os.path.join(
-                    track_output_dir, f"{artist_title} (Instrumental {model_name}).{self.lossless_output_format}"
-                )
-                vocals_path = os.path.join(track_output_dir, f"{artist_title} (Vocals {model_name}).{self.lossless_output_format}")
-                instrumental_path_lossy = os.path.join(
-                    track_output_dir, f"{artist_title} (Instrumental {model_name}).{self.lossy_output_format}"
-                )
-                vocals_path_lossy = os.path.join(track_output_dir, f"{artist_title} (Vocals {model_name}).{self.lossy_output_format}")
-
-                if not (os.path.isfile(instrumental_path) and os.path.isfile(vocals_path)):
-                    self.separate_audio(
-                        audio_file=processed_track["input_audio_wav"],
-                        model_name=model_name,
-                        track_output_dir=track_output_dir,
-                        artist_title=artist_title,
-                        instrumental_path=instrumental_path,
-                        vocals_path=vocals_path,
-                    )
-                    if os.path.isfile(instrumental_path):
-                        self.convert_to_lossy(instrumental_path, instrumental_path_lossy)
-                    if os.path.isfile(vocals_path):
-                        self.convert_to_lossy(vocals_path, vocals_path_lossy)
-
-                processed_track[f"separated_audio"][model_name]["instrumental"] = instrumental_path
-                processed_track[f"separated_audio"][model_name]["vocals"] = vocals_path
-                processed_track[f"separated_audio"][model_name]["instrumental_lossy"] = instrumental_path_lossy
-                processed_track[f"separated_audio"][model_name]["vocals_lossy"] = vocals_path_lossy
+            self.logger.info(f"Separating audio for track: {self.title} by {self.artist}")
+            separation_results = self.process_audio_separation(
+                audio_file=processed_track["input_audio_wav"], artist_title=artist_title, track_output_dir=track_output_dir
+            )
+            processed_track["separated_audio"] = separation_results
 
         self.logger.info("Script finished, audio downloaded, lyrics fetched and audio separated!")
 
         return processed_track
 
-    def convert_to_lossy(self, input_filename, output_filename):
-        if input_filename is None or not os.path.isfile(input_filename):
-            raise Exception(f"Error: Invalid input file provided for convert_to_lossy: {input_filename}")
-
-        self.logger.info(f"Converting {self.lossless_output_format} audio to lossy {self.lossy_output_format} format")
-
-        ffmpeg_extras = "-q:a 0" if self.lossy_output_format == "mp3" else ""
-
-        ffmpeg_command = f'{self.ffmpeg_base_command} -i "{input_filename}" {ffmpeg_extras} "{output_filename}"'
-        self.logger.debug(f"Running command: {ffmpeg_command}")
-        os.system(ffmpeg_command)
-
     def process_playlist(self):
         if self.artist is None or self.title is None:
             raise Exception("Error: Artist and Title are required for processing a local file.")

diff --git a/karaoke_prep/utils/prep_cli.py b/karaoke_prep/utils/prep_cli.py
@@ -56,17 +56,23 @@ def main():
     )
 
     parser.add_argument(
-        "--model_names",
+        "--clean_instrumental_model",
+        default="model_bs_roformer_ep_317_sdr_12.9755.ckpt",
+        help="Optional: Model for clean instrumental separation (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--backing_vocals_models",
+        nargs="+",
+        default=["mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", "UVR-BVE-4B_SN-44100-1.pth"],
+        help="Optional: List of models for backing vocals separation (default: %(default)s).",
+    )
+
+    parser.add_argument(
+        "--other_stems_models",
         nargs="+",
-        default=[
-            "UVR_MDXNET_KARA_2.onnx",
-            "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
-            "2_HP-UVR.pth",
-            "model_bs_roformer_ep_317_sdr_12.9755.yaml",
-            "MDX23C-8KFFT-InstVoc_HQ_2.ckpt",
-            "htdemucs_6s.yaml",
-        ],
-        help="Optional: list of model names to be used for separation (default: %(default)s). Example: --model_names UVR_MDXNET_KARA_2.onnx UVR-MDX-NET-Inst_HQ_4.onnx",
+        default=["htdemucs_6s.yaml"],
+        help="Optional: List of models for other stems separation (default: %(default)s).",
     )
 
     default_model_dir_unix = "/tmp/audio-separator-models/"
@@ -94,12 +100,6 @@ def main():
         help="Optional: lossless output format for separated audio (default: FLAC). Example: --lossless_output_format=WAV",
     )
 
-    parser.add_argument(
-        "--lossy_output_format",
-        default="MP3",
-        help="Optional: lossy output format for separated audio (default: MP3). Example: --lossy_output_format=OGG",
-    )
-
     parser.add_argument(
         "--use_cuda",
         action="store_true",
@@ -361,7 +361,9 @@ def main():
     logger.setLevel(log_level)
 
     if args.existing_instrumental:
-        args.model_names = ["Custom"]
+        args.clean_instrumental_model = None
+        args.backing_vocals_models = []
+        args.other_stems_models = []
 
     logger.info(f"KaraokePrep beginning with input_media: {input_media} artist: {artist} and title: {title}")
 
@@ -373,11 +375,12 @@ def main():
         dry_run=args.dry_run,
         log_formatter=log_formatter,
         log_level=log_level,
-        model_names=args.model_names,
+        clean_instrumental_model=args.clean_instrumental_model,
+        backing_vocals_models=args.backing_vocals_models,
+        other_stems_models=args.other_stems_models,
         model_file_dir=args.model_file_dir,
         output_dir=args.output_dir,
         lossless_output_format=args.lossless_output_format,
-        lossy_output_format=args.lossy_output_format,
         use_cuda=args.use_cuda,
         use_coreml=args.use_coreml,
         normalization_enabled=args.normalize,
@@ -429,11 +432,26 @@ def main():
         logger.info(f" Lyrics: {track['lyrics']}")
         logger.info(f" Processed Lyrics: {track['processed_lyrics']}")
 
-        for model_name in args.model_names:
-            logger.info(f" Instrumental: {track['separated_audio'][model_name]['instrumental']}")
-            logger.info(f" Instrumental (Lossy): {track['separated_audio'][model_name]['instrumental_lossy']}")
-            logger.info(f" Vocals: {track['separated_audio'][model_name]['vocals']}")
-            logger.info(f" Vocals (Lossy): {track['separated_audio'][model_name]['vocals_lossy']}")
+        logger.info(f" Separated Audio:")
+
+        # Clean Instrumental
+        logger.info(f"  Clean Instrumental Model:")
+        for stem_type, file_path in track["separated_audio"]["clean_instrumental"].items():
+            logger.info(f"   {stem_type.capitalize()}: {file_path}")
+
+        # Other Stems
+        logger.info(f"  Other Stems Models:")
+        for model, stems in track["separated_audio"]["other_stems"].items():
+            logger.info(f"   Model: {model}")
+            for stem_type, file_path in stems.items():
+                logger.info(f"    {stem_type.capitalize()}: {file_path}")
+
+        # Backing Vocals
+        logger.info(f"  Backing Vocals Models:")
+        for model, stems in track["separated_audio"]["backing_vocals"].items():
+            logger.info(f"   Model: {model}")
+            for stem_type, file_path in stems.items():
+                logger.info(f"    {stem_type.capitalize()}: {file_path}")
 
 
 if __name__ == "__main__":

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "karaoke-prep"
-version = "0.27.2"
+version = "0.28.0"
 description = "Prepare for karaoke video creation, by downloading audio and lyrics for a specified song or playlist from youtube and separating audio stems. After syncing, finalise the video with a title screen!"
 authors = ["Andrew Beveridge <[email protected]>"]
 license = "MIT"