From c864acf2b7d280d553b01601a35954ba0a366ab3 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 12:10:46 +0100
Subject: [PATCH 1/7] Update versions

---
 requirements.txt | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 836de40ab6..864215117e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -47,11 +47,11 @@ bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
 k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
\ No newline at end of file

From 44494daa27c5b8e1abd424c4ad5f003e3556ec73 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 13:01:32 +0100
Subject: [PATCH 2/7] Update CI version

---
 .github/workflows/pypi-release.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
index 49a5b3004e..2bbcf3cd70 100644
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
         run: |
           set -ex
@@ -38,7 +38,7 @@
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: actions/setup-python@v2
         with:
          python-version: ${{ matrix.python-version }}

From f21067a84a2236d1d8c7c1a8d91ef6704f2dce31 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 13:42:33 +0100
Subject: [PATCH 3/7] Make k_diffusion optional

---
 TTS/tts/layers/tortoise/diffusion.py | 13 +++++++++++--
 requirements.txt                     |  1 -
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py
index cb350af779..fcdaa9d76e 100644
--- a/TTS/tts/layers/tortoise/diffusion.py
+++ b/TTS/tts/layers/tortoise/diffusion.py
@@ -13,12 +13,19 @@
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
+
 SAMPLERS = ["dpm++2m", "p", "ddim"]
@@ -531,6 +538,8 @@ def sample_loop(self, *args, **kwargs):
         if self.conditioning_free is not True:
             raise RuntimeError("cond_free must be true")
         with tqdm(total=self.num_timesteps) as pbar:
+            if K_DIFFUSION_SAMPLERS is None:
+                raise ModuleNotFoundError("Install k_diffusion to use k_diffusion samplers")
             return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
     else:
         raise RuntimeError("sampler not impl")
diff --git a/requirements.txt b/requirements.txt
index 864215117e..ce0e5d9207 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,7 +46,6 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
 einops>=0.6.0
 transformers>=4.33.0
 #deps for bark
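
PATCH 3/7 turns k_diffusion into an optional dependency by deferring the import failure: the try/except keeps diffusion.py importable when the package is absent, records the absence as K_DIFFUSION_SAMPLERS = None, and raises only when a k_diffusion sampler is actually requested, so callers that never request one are unaffected. A minimal standalone sketch of the same deferred-failure pattern (optional_lib and its names are hypothetical stand-ins, not part of the TTS codebase):

    # Optional-dependency guard: the failure is recorded, not raised, at import time.
    try:
        from optional_lib import fast_sample  # hypothetical optional extra

        SAMPLERS = {"fast": fast_sample}
    except ImportError:
        SAMPLERS = None  # marks the feature as unavailable


    def sample(name, *args, **kwargs):
        # Raise only when the optional feature is actually used.
        if SAMPLERS is None:
            raise ModuleNotFoundError("Install optional_lib to use these samplers")
        return SAMPLERS[name](*args, **kwargs)
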
= {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +except ImportError: + K_DIFFUSION_SAMPLERS = None + + SAMPLERS = ["dpm++2m", "p", "ddim"] @@ -531,6 +538,8 @@ def sample_loop(self, *args, **kwargs): if self.conditioning_free is not True: raise RuntimeError("cond_free must be true") with tqdm(total=self.num_timesteps) as pbar: + if K_DIFFUSION_SAMPLERS is None: + raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers") return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) else: raise RuntimeError("sampler not impl") diff --git a/requirements.txt b/requirements.txt index 864215117e..ce0e5d9207 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,7 +46,6 @@ bangla bnnumerizer bnunicodenormalizer #deps for tortoise -k_diffusion einops>=0.6.0 transformers>=4.33.0 #deps for bark From a3279f92942b2d2d0d7a628e79f2002ef5ea88eb Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Fri, 17 Nov 2023 13:43:34 +0100 Subject: [PATCH 4/7] Make style --- TTS/tts/layers/tortoise/diffusion.py | 1 - TTS/tts/layers/xtts/gpt.py | 4 +++- TTS/tts/layers/xtts/tokenizer.py | 23 ++++++++++++----------- TTS/tts/models/xtts.py | 14 +++++--------- tests/zoo_tests/test_models.py | 4 ++-- 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index fcdaa9d76e..7bea02ca08 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -17,7 +17,6 @@ from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper - try: from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index d914ebf90f..e7b186b858 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -441,7 +441,9 @@ def forward( audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token) # Pad mel codes with stop_audio_token - audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet + audio_codes = self.set_mel_padding( + audio_codes, code_lengths - 3 + ) # -3 to get the real code lengths without consider start and stop tokens that was not added yet # Build input and target tensors # Prepend start token to inputs and append stop token to targets diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1ef655a3cc..5284874397 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -1,23 +1,22 @@ import os import re -import torch -import pypinyin import textwrap - from functools import cached_property + +import pypinyin +import torch from hangul_romanize import Transliter from hangul_romanize.rule import academic from num2words import num2words +from spacy.lang.ar import Arabic +from spacy.lang.en import English +from spacy.lang.es import Spanish +from spacy.lang.ja import Japanese +from spacy.lang.zh import Chinese from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words -from spacy.lang.en import English -from spacy.lang.zh import Chinese -from spacy.lang.ja import Japanese -from spacy.lang.ar import Arabic -from spacy.lang.es import Spanish - def get_spacy_lang(lang): if lang == "zh": @@ -32,6 +31,7 @@ def get_spacy_lang(lang): # For most languages, Enlish does the job return English() + def 
From 6075fa208c4f508bd9b629d13b99800724899502 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 17 Nov 2023 11:13:46 -0300
Subject: [PATCH 5/7] Ensure that only the GPT model is in training mode during XTTS GPT training (#3241)

* Ensure that only the GPT model is in training mode during training

* Fix parallel wavegan unit test
---
 TTS/tts/layers/xtts/trainer/gpt_trainer.py     | 7 ++++---
 TTS/vocoder/configs/parallel_wavegan_config.py | 1 +
 requirements.txt                               | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
index 005b30bede..4789e1f43f 100644
--- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py
+++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -318,9 +318,10 @@ def eval_step(self, batch, criterion):
         batch["cond_idxs"] = None
         return self.train_step(batch, criterion)
 
-    def on_epoch_start(self, trainer):  # pylint: disable=W0613
-        # guarante that dvae will be in eval mode after .train() on evaluation end
-        self.dvae = self.dvae.eval()
+    def on_train_epoch_start(self, trainer):
+        trainer.model.eval()  # the whole model to eval
+        # put gpt model in training mode
+        trainer.model.xtts.gpt.train()
 
     def on_init_end(self, trainer):  # pylint: disable=W0613
         # ignore similarities.pth on clearml save/upload
diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py
index 7845dd6bf8..6059d7f04f 100644
--- a/TTS/vocoder/configs/parallel_wavegan_config.py
+++ b/TTS/vocoder/configs/parallel_wavegan_config.py
@@ -94,6 +94,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
     use_noise_augment: bool = False
     use_cache: bool = True
     steps_to_start_discriminator: int = 200000
+    target_loss: str = "loss_1"
 
     # LOSS PARAMETERS - overrides
     use_stft_loss: bool = True
diff --git a/requirements.txt b/requirements.txt
index ce0e5d9207..1f7a44f6d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ pandas>=1.4,<2.0
 # deps for training
 matplotlib>=3.7.0
 # coqui stack
-trainer
+trainer>=0.0.32
 # config management
 coqpit>=0.0.16
 # chinese g2p deps
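
The replacement hook in PATCH 5/7 relies on how torch.nn.Module mode flags propagate: .eval() recursively takes every submodule out of training mode (disabling dropout and batch-norm statistics updates), and a subsequent .train() on one child re-enables training behavior for that subtree alone. A minimal sketch of the same freeze-all-then-wake-one pattern on a toy module (the names are illustrative, not the real XTTS layout):

    import torch.nn as nn

    # Toy composite model: one part meant to stay frozen, one part being trained.
    model = nn.ModuleDict(
        {
            "dvae": nn.Sequential(nn.Linear(8, 8), nn.Dropout(0.1)),
            "gpt": nn.Sequential(nn.Linear(8, 8), nn.Dropout(0.1)),
        }
    )

    model.eval()  # every submodule leaves training mode...
    model["gpt"].train()  # ...then only the "gpt" subtree re-enters it

    print(model["dvae"].training, model["gpt"].training)  # False True

Note that .train() and .eval() only toggle behavioral flags such as dropout; they do not freeze gradients, which is handled separately by which parameters the optimizer receives.
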
From 52cb1e2f680a982bcc825e89690cf5d736b8044a Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 15:16:08 +0100
Subject: [PATCH 6/7] Update model hash for v2.0.2

---
 TTS/.models.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 13da715b89..5f4008fb01 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -3,14 +3,14 @@
     "multilingual": {
         "multi-dataset": {
             "xtts_v2": {
-                "description": "XTTS-v2 by Coqui with 16 languages.",
+                "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
                 "hf_url": [
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
                 ],
-                "model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
+                "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
                 "default_vocoder": null,
                 "commit": "480a6cdf7",
                 "license": "CPML",

From c011ab7455875f42b795d7fef61e2ddb1bad2910 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Thu, 16 Nov 2023 17:52:13 +0100
Subject: [PATCH 7/7] Update to v0.20.6

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 1b619f3482..752e630381 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.20.5
+0.20.6