Multistem #509

Open · wants to merge 13 commits into main
3 changes: 1 addition & 2 deletions .gitignore
@@ -58,5 +58,4 @@ dataset/*
 */notes
 .vscode/
 /notebooks
-/local_scripts
-/notes
+/local_scripts
Binary file added assets/drum_loop.wav
Binary file not shown.
Binary file added assets/pop_song.wav
Binary file not shown.
6 changes: 6 additions & 0 deletions audiocraft/grids/musicgenstem/__init__.py
@@ -0,0 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""MusicGenStem grids."""
93 changes: 93 additions & 0 deletions audiocraft/grids/musicgenstem/_explorers.py
@@ -0,0 +1,93 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

import treetable as tt

from .._base_explorers import BaseExplorer


class LMExplorer(BaseExplorer):
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        return ['train', 'valid']

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        return [
            tt.group(
                'train',
                [
                    tt.leaf('epoch'),
                    tt.leaf('duration', '.1f'),  # duration in minutes
                    tt.leaf('ping'),
                    tt.leaf('ce', '.4f'),  # cross entropy
                    tt.leaf('ppl', '.3f'),  # perplexity
                ],
                align='>',
            ),
            tt.group(
                'valid',
                [
                    tt.leaf('ce', '.4f'),
                    tt.leaf('ppl', '.3f'),
                    tt.leaf('best_ppl', '.3f'),
                ],
                align='>',
            ),
        ]

    def process_sheep(self, sheep, history):
        parts = super().process_sheep(sheep, history)

        track_by = {'ppl': 'lower'}  # values should be in ['lower', 'higher']
        best_metrics = {k: (1 if v == 'lower' else -1) * float('inf') for k, v in track_by.items()}

        def comparator(mode, a, b):
            return a < b if mode == 'lower' else a > b

        for metrics in history:
            for key, sub in metrics.items():
                for metric in track_by:
                    # for the validation set, keep track of best metrics (ppl in this example)
                    # this is so we can conveniently compare metrics between runs in the grid
                    if key == 'valid' and metric in sub and comparator(
                        track_by[metric], sub[metric], best_metrics[metric]
                    ):
                        best_metrics[metric] = sub[metric]

        if 'valid' in parts:
            parts['valid'].update({f'best_{k}': v for k, v in best_metrics.items()})
        return parts


class GenerationEvalExplorer(BaseExplorer):
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        return ['evaluate']

    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table."""
        return [
            tt.group(
                'evaluate',
                [
                    tt.leaf('epoch', '.3f'),
                    tt.leaf('duration', '.1f'),
                    tt.leaf('ping'),
                    tt.leaf('ce', '.4f'),
                    tt.leaf('ppl', '.3f'),
                    tt.leaf('fad', '.3f'),
                    tt.leaf('kld', '.3f'),
                    tt.leaf('text_consistency', '.3f'),
                    tt.leaf('chroma_cosine', '.3f'),
                ],
                align='>',
            ),
        ]
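Aside: the best-metric bookkeeping in `process_sheep` above boils down to a scan for the lowest validation perplexity across the run's history. A standalone toy version of that logic (illustration only, not code from the PR):

history = [
    {'train': {'ppl': 12.0}, 'valid': {'ppl': 11.0}},
    {'train': {'ppl': 9.5}, 'valid': {'ppl': 10.2}},
    {'train': {'ppl': 8.8}, 'valid': {'ppl': 10.6}},
]
best_ppl = float('inf')
for metrics in history:
    valid = metrics.get('valid', {})
    if 'ppl' in valid and valid['ppl'] < best_ppl:
        best_ppl = valid['ppl']
print(best_ppl)  # 10.2, surfaced as `best_ppl` in the grid table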
37 changes: 37 additions & 0 deletions audiocraft/grids/musicgenstem/musicgenstem_base_32khz.py
@@ -0,0 +1,37 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from ._explorers import LMExplorer
from ...environment import AudioCraftEnvironment


@LMExplorer
def explorer(launcher):
    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=64, partition=partitions).bind_(label='64gpus')

    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')

    # bind the solver configuration in place so that the sub-launcher
    # below inherits it
    launcher.bind_({'solver': 'musicgenstem/musicgenstem_base_32khz',
                    'autocast': False,
                    'fsdp.use': True,
                    'model/lm/model_scale': 'medium',
                    'optim.optimizer': 'adamw',
                    'optim.lr': 1e-4,
                    'generate.every': 25,
                    'dataset.generate.num_samples': 64,
                    })

    sub = launcher.bind()

    sub({'transformer_lm.n_q': 6,
         'codebooks_pattern.delay.delays': [0, 0, 0, 1, 2, 3]})

    sub({'transformer_lm.n_q': 7,
         'codebooks_pattern.delay.delays': [0, 1, 0, 0, 1, 2, 3],
         'multistem_compression_model_checkpoints.pretrained': 'facebook/encodec_32_khz_bass_2_drums_1_other_4'})
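Aside: the `codebooks_pattern.delay.delays` lists stagger the parallel codebook streams so that coarse tokens are generated a few steps ahead of the finer ones that depend on them. A rough standalone sketch of the staggering (illustration only; AudioCraft's actual delay pattern provider also handles special tokens and un-delaying at decode time):

import typing as tp

def apply_delays(tokens: tp.List[tp.List[int]],
                 delays: tp.List[int]) -> tp.List[tp.List[tp.Optional[int]]]:
    """Shift codebook k right by delays[k]: step t reads tokens[k][t - delays[k]]."""
    T = len(tokens[0])
    return [[None] * d + stream[:T - d] for stream, d in zip(tokens, delays)]

# with delays [0, 0, 0, 1, 2, 3], the last three codebooks lag by 1 to 3 steps
streams = [[10 * k + t for t in range(5)] for k in range(6)]
print(apply_delays(streams, [0, 0, 0, 1, 2, 3])[3])  # [None, 30, 31, 32, 33]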
85 changes: 85 additions & 0 deletions (new file under audiocraft/grids/musicgenstem/; filename not shown in the diff)
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Evaluation with objective metrics for the pretrained MusicGen models.
This grid takes signatures from the training grid and runs an evaluation-only stage.

When running the grid for the first time, please use:
REGEN=1 dora grid musicgen.musicgen_pretrained_32khz_eval
and re-use the REGEN=1 option when the grid is changed to force regenerating it.

Note that you need the proper metrics external libraries setup to use all
the objective metrics activated in this grid. Refer to the README for more information.
"""

import os

from ._explorers import GenerationEvalExplorer
from ...environment import AudioCraftEnvironment
from ... import train


def eval(launcher, batch_size: int = 32, eval_melody: bool = False):
    opts = {
        'dset': 'audio/musiccaps_32khz',
        'solver/musicgen/evaluation': 'objective_eval',
        'execute_only': 'evaluate',
        '+dataset.evaluate.batch_size': batch_size,
        '+metrics.fad.tf.batch_size': 16,
    }
    # binary for FAD computation: replace this path with your own path
    metrics_opts = {
        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
    }
    opt1 = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
    opt2 = {'transformer_lm.two_step_cfg': True}

    sub = launcher.bind(opts)
    sub.bind_(metrics_opts)

    # base objective metrics
    sub(opt1, opt2)


@GenerationEvalExplorer
def explorer(launcher):
    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=4, partition=partitions)

    if 'REGEN' not in os.environ:
        folder = train.main.dora.dir / 'grids' / __name__.split('.', 2)[-1]
        with launcher.job_array():
            for sig in folder.iterdir():
                if not sig.is_symlink():
                    continue
                xp = train.main.get_xp_from_sig(sig.name)
                launcher(xp.argv)
        return

    with launcher.job_array():
        musicgen_base = launcher.bind(solver="musicgen/musicgen_base_32khz")
        musicgen_base.bind_({'autocast': False, 'fsdp.use': True})

        # base musicgen models
        musicgen_base_small = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-small'})
        eval(musicgen_base_small, batch_size=128)

        musicgen_base_medium = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-medium'})
        musicgen_base_medium.bind_({'model/lm/model_scale': 'medium'})
        eval(musicgen_base_medium, batch_size=128)

        musicgen_base_large = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-large'})
        musicgen_base_large.bind_({'model/lm/model_scale': 'large'})
        eval(musicgen_base_large, batch_size=128)

        # melody musicgen model
        musicgen_melody = launcher.bind(solver="musicgen/musicgen_melody_32khz")
        musicgen_melody.bind_({'autocast': False, 'fsdp.use': True})

        musicgen_melody_medium = musicgen_melody.bind({'continue_from': '//pretrained/facebook/musicgen-melody'})
        musicgen_melody_medium.bind_({'model/lm/model_scale': 'medium'})
        eval(musicgen_melody_medium, batch_size=128, eval_melody=True)
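Aside on the Dora launcher idioms used throughout these grids: as we understand facebookresearch/dora, `bind` returns a new sub-launcher carrying extra overrides, the trailing-underscore variants (`bind_`, `slurm_`) mutate the launcher in place, and calling the launcher schedules one experiment. A simplified stand-in capturing just that convention (not Dora's real API):

class MockLauncher:
    def __init__(self, overrides=None):
        self.overrides = dict(overrides or {})

    def bind(self, extra=None, **kw):
        # returns a NEW sub-launcher with the merged overrides
        return MockLauncher({**self.overrides, **(extra or {}), **kw})

    def bind_(self, extra=None, **kw):
        # trailing underscore: mutate this launcher in place
        self.overrides.update({**(extra or {}), **kw})

    def __call__(self, extra=None):
        # calling the launcher schedules one experiment (XP)
        print('launch:', {**self.overrides, **(extra or {})})

launcher = MockLauncher()
sub = launcher.bind({'solver': 'musicgen/musicgen_base_32khz'})
sub.bind_({'autocast': False, 'fsdp.use': True})
sub({'continue_from': '//pretrained/facebook/musicgen-small'})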
3 changes: 2 additions & 1 deletion audiocraft/models/__init__.py
@@ -10,7 +10,7 @@
 from . import builders, loaders
 from .encodec import (
     CompressionModel, EncodecModel, DAC,
-    HFEncodecModel, HFEncodecCompressionModel)
+    HFEncodecModel, HFEncodecCompressionModel, MultiStemCompressionModel)
 from .audiogen import AudioGen
 from .lm import LMModel
 from .lm_magnet import MagnetLMModel
@@ -19,3 +19,4 @@
 from .magnet import MAGNeT
 from .unet import DiffusionUnet
 from .watermark import WMModel
+from .musicgenstem import MusicGenStem
22 changes: 17 additions & 5 deletions audiocraft/models/builders.py
@@ -26,11 +26,11 @@
 from ..modules.conditioners import (BaseConditioner, ChromaStemConditioner,
                                     CLAPEmbeddingConditioner, ConditionFuser,
                                     ConditioningProvider, LUTConditioner,
-                                    T5Conditioner, StyleConditioner)
+                                    T5Conditioner, StyleConditioner, MultiStemStyleConditioner)
 from ..modules.diffusion_schedule import MultiBandProcessor, SampleProcessor
 from ..utils.utils import dict_from_config
 from .encodec import (CompressionModel, EncodecModel,
-                      InterleaveStereoCompressionModel)
+                      InterleaveStereoCompressionModel, MultiStemCompressionModel)
 from .lm import LMModel
 from .lm_magnet import MagnetLMModel
 from .unet import DiffusionUnet
@@ -40,9 +40,11 @@
 def get_quantizer(
     quantizer: str, cfg: omegaconf.DictConfig, dimension: int
 ) -> qt.BaseQuantizer:
-    klass = {"no_quant": qt.DummyQuantizer, "rvq": qt.ResidualVectorQuantizer}[
-        quantizer
-    ]
+    klass = {
+        'no_quant': qt.DummyQuantizer,
+        'rvq': qt.ResidualVectorQuantizer,
+        'dac': qt.DACResidualVectorQuantizer,
+    }[quantizer]
     kwargs = dict_from_config(getattr(cfg, quantizer))
     if quantizer != "no_quant":
         kwargs["dimension"] = dimension
@@ -52,6 +54,9 @@
 def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
     if encoder_name == "seanet":
         kwargs = dict_from_config(getattr(cfg, "seanet"))
+        # deprecated params
+        kwargs.pop("encoder_transformer", None)
+        kwargs.pop("decoder_transformer", None)
         encoder_override_kwargs = kwargs.pop("encoder")
         decoder_override_kwargs = kwargs.pop("decoder")
         encoder_kwargs = {**kwargs, **encoder_override_kwargs}
@@ -167,6 +172,13 @@ def get_conditioner_provider(
                 device=device,
                 **model_args
             )
+        elif model_type == 'multistem_style':
+            conditioners[str(cond)] = MultiStemStyleConditioner(
+                output_dim=output_dim,
+                device=device,
+                **model_args
+            )
+
         else:
             raise ValueError(f"Unrecognized conditioning model: {model_type}")
     conditioner = ConditioningProvider(
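Aside: the two dispatch styles above differ in failure mode: `get_quantizer` indexes a dict, so an unknown name fails with a bare KeyError, while `get_conditioner_provider` walks an if/elif chain and raises a descriptive ValueError. A toy contrast (conditioner names other than 'style' and 'multistem_style' are inferred from the imports above; illustration only):

QUANTIZERS = {'no_quant': 'DummyQuantizer', 'rvq': 'ResidualVectorQuantizer',
              'dac': 'DACResidualVectorQuantizer'}

def pick_quantizer(name: str) -> str:
    return QUANTIZERS[name]  # unknown name -> KeyError

def pick_conditioner(name: str) -> str:
    if name in ('t5', 'lut', 'chroma_stem', 'clap', 'style', 'multistem_style'):
        return name
    raise ValueError(f'Unrecognized conditioning model: {name}')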