diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index c61ec0197b..de7e439d8a 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -6,33 +6,53 @@ labels: bug assignees: '' --- + +## 🐛 Description -Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support! + -This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. +### To Reproduce -If you've found a bug, please provide the following information: + + +### Expected behavior + + + +### Environment + + -**Environment (please complete the following information):** -- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: -- **PyTorch or TensorFlow version (use command below)**: -- **Python version**: -- **CUDA/cuDNN version**: -- **GPU model and memory**: -- **Exact command to reproduce**: +- 🐸TTS Version (e.g., 1.3.0): +- PyTorch Version (e.g., 1.8) +- Python version: +- OS (e.g., Linux): +- CUDA/cuDNN version: +- GPU models and configuration: +- How you installed PyTorch (`conda`, `pip`, source): +- Any other relevant information: -**Additional context** -Add any other context about the problem here. +### Additional context + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index d49aa70749..941ab9b143 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,25 +2,24 @@ name: 🚀 Feature request about: Suggest a feature or an idea for this project title: '[Feature request] ' -labels: feature request +labels: feature request assignees: '' --- + +**🚀 Feature Description** -Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support! + -This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. +**Solution** -If you have a feature request, then please provide the following information: + -**Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +**Alternative Solutions** -**Describe the solution you'd like** -A clear and concise description of what you want to happen. - -**Describe alternatives you've considered** -A clear and concise description of any alternative solutions or features you've considered. + **Additional context** -Add any other context or screenshots about the feature request here. 
+ + diff --git a/TTS/.models.json b/TTS/.models.json index 6bff584b92..44c5fc6c63 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -98,7 +98,7 @@ "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip", - "default_vocoder": "vocoder_models/en/vctk/hifigan_v2", + "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", "license": "CC BY-NC-ND 4.0", diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py new file mode 100644 index 0000000000..662fcd02ec --- /dev/null +++ b/TTS/bin/collect_env_info.py @@ -0,0 +1,48 @@ +"""Get detailed info about the working environment.""" +import os +import platform +import sys + +import numpy +import torch + +sys.path += [os.path.abspath(".."), os.path.abspath(".")] +import json + +import TTS + + +def system_info(): + return { + "OS": platform.system(), + "architecture": platform.architecture(), + "version": platform.version(), + "processor": platform.processor(), + "python": platform.python_version(), + } + + +def cuda_info(): + return { + "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], + "available": torch.cuda.is_available(), + "version": torch.version.cuda, + } + + +def package_info(): + return { + "numpy": numpy.__version__, + "PyTorch_version": torch.__version__, + "PyTorch_debug": torch.version.debug, + "TTS": TTS.__version__, + } + + +def main(): + details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} + print(json.dumps(details, indent=4, sort_keys=True)) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 3e69e1adca..fb2e41b4ab 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -254,7 +254,7 @@ def main(): print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav) + wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/server/server.py b/TTS/server/server.py index 5d66d9eb63..c6d67141c0 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -103,7 +103,7 @@ def convert_boolean(x): model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda ) -use_multi_speaker = hasattr(synthesizer.tts_model, "speaker_manager") and synthesizer.tts_model.num_speakers > 1 +use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1 speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) # TODO: set this from SpeakerManager use_gst = synthesizer.tts_config.get("use_gst", False) diff --git a/TTS/trainer.py b/TTS/trainer.py index 9fcd77a765..2a2cfc46bb 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -284,8 +284,8 @@ def __init__( # pylint: disable=dangerous-default-value self.optimizer = self.get_optimizer(self.model, self.config) # CALLBACK - self.callbacks = TrainerCallback(self) - self.callbacks.on_init_start() + self.callbacks = TrainerCallback() + self.callbacks.on_init_start(self) # init AMP if self.use_amp_scaler: @@ -324,7 +324,7 @@ def __init__( # pylint: disable=dangerous-default-value num_params = count_parameters(self.model) print("\n > Model has {} parameters".format(num_params)) - self.callbacks.on_init_end() + self.callbacks.on_init_end(self) @staticmethod def 
parse_argv(args: Union[Coqpit, List]): @@ -677,7 +677,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_ti Returns: Tuple[Dict, Dict]: Model outputs and losses. """ - self.callbacks.on_train_step_start() + self.callbacks.on_train_step_start(self) # format data batch = self.format_batch(batch) loader_time = time.time() - loader_start_time @@ -792,7 +792,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_ti self.dashboard_logger.flush() self.total_steps_done += 1 - self.callbacks.on_train_step_end() + self.callbacks.on_train_step_end(self) return outputs, loss_dict def train_epoch(self) -> None: @@ -983,7 +983,7 @@ def _fit(self) -> None: if self.num_gpus > 1: # let all processes sync up before starting with a new epoch of training dist.barrier() - self.callbacks.on_epoch_start() + self.callbacks.on_epoch_start(self) self.keep_avg_train = KeepAverage() self.keep_avg_eval = KeepAverage() if self.config.run_eval else None self.epochs_done = epoch @@ -999,7 +999,7 @@ def _fit(self) -> None: ) if self.args.rank in [None, 0]: self.save_best_model() - self.callbacks.on_epoch_end() + self.callbacks.on_epoch_end(self) def fit(self) -> None: """Where the ✨️magic✨️ happens...""" @@ -1008,7 +1008,7 @@ def fit(self) -> None: if self.args.rank == 0: self.dashboard_logger.finish() except KeyboardInterrupt: - self.callbacks.on_keyboard_interrupt() + self.callbacks.on_keyboard_interrupt(self) # if the output folder is empty remove the run. remove_experiment_folder(self.output_path) # clear the DDP processes diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index 040a891091..31d994421d 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -110,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig): model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False) # multi-speaker settings + num_speakers: int = 0 speakers_file: str = None use_speaker_embedding: bool = False use_d_vector_file: bool = False @@ -142,7 +143,7 @@ class FastSpeechConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE # dataset configs - compute_f0: bool = True + compute_f0: bool = False f0_cache_path: str = None # testing diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 780249363d..4fae974f88 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -91,6 +91,8 @@ def load_tts_samples( for idx, ins in enumerate(meta_data_eval_all): attn_file = meta_data[ins[1]].strip() meta_data_eval_all[idx].append(attn_file) + # set none for the next iter + formatter = None return meta_data_train_all, meta_data_eval_all @@ -110,3 +112,18 @@ def _get_formatter_by_name(name): """Returns the respective preprocessing function.""" thismodule = sys.modules[__name__] return getattr(thismodule, name.lower()) + + +def find_unique_chars(data_samples, verbose=True): + texts = "".join(item[0] for item in data_samples) + chars = set(texts) + lower_chars = filter(lambda c: c.islower(), chars) + chars_force_lower = [c.lower() for c in chars] + chars_force_lower = set(chars_force_lower) + + if verbose: + print(f" > Number of unique characters: {len(chars)}") + print(f" > Unique characters: {''.join(sorted(chars))}") + print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") + print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + return chars_force_lower diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py 
index dcd1874006..425eb0cdd0 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -60,7 +60,13 @@ def mozilla_de(root_path, meta_file): def mailabs(root_path, meta_files=None): - """Normalizes M-AI-Labs meta data files to TTS format""" + """Normalizes M-AI-Labs meta data files to TTS format + + Args: + root_path (str): root folder of the MAILAB language folder. + meta_files (str): list of meta files to be used in the training. If None, finds all the csv files + recursively. Defaults to None + """ speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") if meta_files is None: csv_files = glob(root_path + "/**/metadata.csv", recursive=True) diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 53f7ca7c04..91e53da325 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -266,7 +266,7 @@ def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0): flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = torch.flip(z, [1]) z = flow(z, x_mask, g=x, reverse=reverse) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index b77c1e2315..854526de8c 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -219,7 +219,7 @@ def get_data_loader( use_phonemes=config.use_phonemes, phoneme_language=config.phoneme_language, enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_eval, + use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 5185139e93..578c26c0d3 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -250,11 +250,11 @@ def synthesis( # GST processing style_mel = None custom_symbols = None - if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: - if isinstance(style_wav, dict): - style_mel = style_wav - else: - style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + if style_wav: + style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + elif CONFIG.has("gst") and CONFIG.gst and not style_wav: + if CONFIG.gst.gst_style_input_weights: + style_mel = CONFIG.gst.gst_style_input_weights if hasattr(model, "make_symbols"): custom_symbols = model.make_symbols(CONFIG) # preprocess the given text diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 66f518b407..537d23017f 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -5,6 +5,7 @@ from typing import Dict, List import gruut +from gruut_ipa import IPA from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes @@ -32,7 +33,7 @@ GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def text2phone(text, language, use_espeak_phonemes=False): +def text2phone(text, language, use_espeak_phonemes=False, keep_stress=False): """Convert graphemes to phonemes. 
Parameters: text (str): text to phonemize @@ -51,36 +52,44 @@ def text2phone(text, language, use_espeak_phonemes=False): ph = japanese_text_to_phonemes(text) return ph - if gruut.is_language_supported(language): - # Use gruut for phonemization - phonemizer_args = { - "remove_stress": True, - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - } - - if use_espeak_phonemes: - # Use a lexicon/g2p model train on eSpeak IPA instead of gruut IPA. - # This is intended for backwards compatibility with TTS<=v0.0.13 - # pre-trained models. - phonemizer_args["model_prefix"] = "espeak" - - ph_list = gruut.text_to_phonemes( - text, - lang=language, - return_format="word_phonemes", - phonemizer_args=phonemizer_args, - ) - - # Join and re-split to break apart dipthongs, suprasegmentals, etc. - ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] - ph = "| ".join(ph_words) - - # Fix a few phonemes - ph = ph.translate(GRUUT_TRANS_TABLE) - return ph - - raise ValueError(f" [!] Language {language} is not supported for phonemization.") + if not gruut.is_language_supported(language): + raise ValueError(f" [!] Language {language} is not supported for phonemization.") + + # Use gruut for phonemization + ph_list = [] + for sentence in gruut.sentences(text, lang=language, espeak=use_espeak_phonemes): + for word in sentence: + if word.is_break: + # Use actual character for break phoneme (e.g., comma) + if ph_list: + # Join with previous word + ph_list[-1].append(word.text) + else: + # First word is punctuation + ph_list.append([word.text]) + elif word.phonemes: + # Add phonemes for word + word_phonemes = [] + + for word_phoneme in word.phonemes: + if not keep_stress: + # Remove primary/secondary stress + word_phoneme = IPA.without_stress(word_phoneme) + + word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) + + if word_phoneme: + # Flatten phonemes + word_phonemes.extend(word_phoneme) + + if word_phonemes: + ph_list.append(word_phonemes) + + # Join and re-split to break apart dipthongs, suprasegmentals, etc. + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph = "| ".join(ph_words) + + return ph def intersperse(sequence, token): diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index dd9c570197..e64b95e01f 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -674,7 +674,7 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: return f0 ### Audio Processing ### - def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: """Find the last point without silence at the end of a audio signal. 
Args: @@ -687,7 +687,7 @@ def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) """ window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4) - threshold = self._db_to_amp(threshold_db) + threshold = self._db_to_amp(-self.trim_db) for x in range(hop_length, len(wav) - window_length, hop_length): if np.max(wav[x : x + window_length]) < threshold: return x + hop_length diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py index 18b6c34c27..511d215c65 100644 --- a/TTS/utils/callbacks.py +++ b/TTS/utils/callbacks.py @@ -1,75 +1,105 @@ class TrainerCallback: - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - - def on_init_start(self) -> None: - if hasattr(self.trainer.model, "on_init_start"): - self.trainer.model.on_init_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_init_start"): - self.trainer.criterion.on_init_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_init_start"): - self.trainer.optimizer.on_init_start(self.trainer) - - def on_init_end(self) -> None: - if hasattr(self.trainer.model, "on_init_end"): - self.trainer.model.on_init_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_init_end"): - self.trainer.criterion.on_init_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_init_end"): - self.trainer.optimizer.on_init_end(self.trainer) - - def on_epoch_start(self) -> None: - if hasattr(self.trainer.model, "on_epoch_start"): - self.trainer.model.on_epoch_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_epoch_start"): - self.trainer.criterion.on_epoch_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_epoch_start"): - self.trainer.optimizer.on_epoch_start(self.trainer) - - def on_epoch_end(self) -> None: - if hasattr(self.trainer.model, "on_epoch_end"): - self.trainer.model.on_epoch_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_epoch_end"): - self.trainer.criterion.on_epoch_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_epoch_end"): - self.trainer.optimizer.on_epoch_end(self.trainer) - - def on_train_step_start(self) -> None: - if hasattr(self.trainer.model, "on_train_step_start"): - self.trainer.model.on_train_step_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_train_step_start"): - self.trainer.criterion.on_train_step_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_train_step_start"): - self.trainer.optimizer.on_train_step_start(self.trainer) - - def on_train_step_end(self) -> None: - - if hasattr(self.trainer.model, "on_train_step_end"): - self.trainer.model.on_train_step_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_train_step_end"): - self.trainer.criterion.on_train_step_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_train_step_end"): - self.trainer.optimizer.on_train_step_end(self.trainer) - - def on_keyboard_interrupt(self) -> None: - if hasattr(self.trainer.model, "on_keyboard_interrupt"): - self.trainer.model.on_keyboard_interrupt(self.trainer) - - if hasattr(self.trainer.criterion, "on_keyboard_interrupt"): - self.trainer.criterion.on_keyboard_interrupt(self.trainer) - - if hasattr(self.trainer.optimizer, "on_keyboard_interrupt"): - self.trainer.optimizer.on_keyboard_interrupt(self.trainer) + @staticmethod + def on_init_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_start"): + trainer.model.module.on_init_start(trainer) + else: + if hasattr(trainer.model, 
"on_init_start"): + trainer.model.on_init_start(trainer) + + if hasattr(trainer.criterion, "on_init_start"): + trainer.criterion.on_init_start(trainer) + + if hasattr(trainer.optimizer, "on_init_start"): + trainer.optimizer.on_init_start(trainer) + + @staticmethod + def on_init_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_end"): + trainer.model.module.on_init_end(trainer) + else: + if hasattr(trainer.model, "on_init_end"): + trainer.model.on_init_end(trainer) + + if hasattr(trainer.criterion, "on_init_end"): + trainer.criterion.on_init_end(trainer) + + if hasattr(trainer.optimizer, "on_init_end"): + trainer.optimizer.on_init_end(trainer) + + @staticmethod + def on_epoch_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_epoch_start"): + trainer.model.module.on_epoch_start(trainer) + else: + if hasattr(trainer.model, "on_epoch_start"): + trainer.model.on_epoch_start(trainer) + + if hasattr(trainer.criterion, "on_epoch_start"): + trainer.criterion.on_epoch_start(trainer) + + if hasattr(trainer.optimizer, "on_epoch_start"): + trainer.optimizer.on_epoch_start(trainer) + + @staticmethod + def on_epoch_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_epoch_end"): + trainer.model.module.on_epoch_end(trainer) + else: + if hasattr(trainer.model, "on_epoch_end"): + trainer.model.on_epoch_end(trainer) + + if hasattr(trainer.criterion, "on_epoch_end"): + trainer.criterion.on_epoch_end(trainer) + + if hasattr(trainer.optimizer, "on_epoch_end"): + trainer.optimizer.on_epoch_end(trainer) + + @staticmethod + def on_train_step_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_start"): + trainer.model.module.on_train_step_start(trainer) + else: + if hasattr(trainer.model, "on_train_step_start"): + trainer.model.on_train_step_start(trainer) + + if hasattr(trainer.criterion, "on_train_step_start"): + trainer.criterion.on_train_step_start(trainer) + + if hasattr(trainer.optimizer, "on_train_step_start"): + trainer.optimizer.on_train_step_start(trainer) + + @staticmethod + def on_train_step_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_end"): + trainer.model.module.on_train_step_end(trainer) + else: + if hasattr(trainer.model, "on_train_step_end"): + trainer.model.on_train_step_end(trainer) + + if hasattr(trainer.criterion, "on_train_step_end"): + trainer.criterion.on_train_step_end(trainer) + + if hasattr(trainer.optimizer, "on_train_step_end"): + trainer.optimizer.on_train_step_end(trainer) + + @staticmethod + def on_keyboard_interrupt(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_keyboard_interrupt"): + trainer.model.module.on_keyboard_interrupt(trainer) + else: + if hasattr(trainer.model, "on_keyboard_interrupt"): + trainer.model.on_keyboard_interrupt(trainer) + + if hasattr(trainer.criterion, "on_keyboard_interrupt"): + trainer.criterion.on_keyboard_interrupt(trainer) + + if hasattr(trainer.optimizer, "on_keyboard_interrupt"): + trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index af07419f44..043c498236 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -74,6 +74,8 @@ def __init__( if vocoder_checkpoint: self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) 
self.output_sample_rate = self.vocoder_config.audio["sample_rate"] + else: + print(" > Using Griffin-Lim as no vocoder model defined") @staticmethod def _get_segmenter(lang: str): @@ -265,7 +267,8 @@ def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None waveform = waveform.squeeze() # trim silence - waveform = trim_silence(waveform, self.ap) + if self.tts_config.audio["do_trim_silence"] is True: + waveform = trim_silence(waveform, self.ap) wavs += list(waveform) wavs += [0] * 10000 diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 81ba87c42f..e36c2cd16a 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -310,6 +310,7 @@ def get_data_loader( # pylint: disable=no-self-use data_items: List, verbose: bool, num_gpus: int, + rank: int = 0, # pylint: disable=unused-argument ): """Initiate and return the GAN dataloader. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index fd9a295c03..42b9e51887 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar ``` ```bash CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \ --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar ``` As stated above, you can also use command-line arguments to change the model configuration. @@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md index cbefc61dc0..3db38af0b0 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/formatting_your_dataset.md @@ -19,15 +19,15 @@ Let's assume you created the audio clips and their transcription. You can collec You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text. -We recommend the following format delimited by `||`. +We recommend the following format delimited by `||`. In the following example, `audio1`, `audio2` refer to files `audio1.wav`, `audio2.wav` etc. ``` # metadata.txt -audio1.wav || This is my sentence. -audio2.wav || This is maybe my sentence. -audio3.wav || This is certainly my sentence. -audio4.wav || Let this be your sentence. +audio1||This is my sentence. +audio2||This is maybe my sentence. +audio3||This is certainly my sentence. +audio4||Let this be your sentence. ... 
``` @@ -80,4 +80,4 @@ See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `t See `TTS.vocoder.datasets.*`, for different `Dataset` implementations for the `vocoder` models. See `TTS.utils.audio.AudioProcessor` that includes all the audio processing and feature extraction functions used in a -`Dataset` implementation. Feel free to add things as you need.passed \ No newline at end of file +`Dataset` implementation. Feel free to add things as you need.passed diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 346d650b8f..63efb78470 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -35,7 +35,7 @@ test_delay_epochs=-1, r=2, # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], - double_decoder_consistency=False, + double_decoder_consistency=True, epochs=1000, text_cleaner="phoneme_cleaners", use_phonemes=True, diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py new file mode 100644 index 0000000000..346d650b8f --- /dev/null +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -0,0 +1,87 @@ +import os + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.trainer import Trainer, TrainingArgs +from TTS.tts.configs.shared_configs import BaseDatasetConfig +from TTS.tts.configs.tacotron2_config import Tacotron2Config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) + +audio_config = BaseAudioConfig( + sample_rate=22050, + resample=False, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this False for faster training. + do_trim_silence=True, + trim_db=23.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + preemphasis=0.0, +) + +config = Tacotron2Config( # This is the config that is saved for the future use + audio=audio_config, + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + r=2, + # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + double_decoder_consistency=False, + epochs=1000, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=150, + print_eval=False, + mixed_precision=True, + sort_by_audio_len=True, + min_seq_len=14800, + max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + output_path=output_path, + datasets=[dataset_config], + use_speaker_embedding=True, # set this to enable multi-sepeaker training + decoder_ssim_alpha=0.0, # disable ssim losses that causes NaN for some runs. 
+ postnet_ssim_alpha=0.0, + postnet_diff_spec_alpha=0.0, + decoder_diff_spec_alpha=0.0, + attention_norm="softmax", + optimizer="Adam", + lr_scheduler=None, + lr=3e-5, +) + +# init audio processor +ap = AudioProcessor(**config.audio.to_dict()) + +# load training samples +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) + +# init speaker manager for multi-speaker training +# it mainly handles speaker-id to speaker-name for the model and the data-loader +speaker_manager = SpeakerManager() +speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) + +# init model +model = Tacotron2(config, speaker_manager) + +# init the trainer and 🚀 +trainer = Trainer( + TrainingArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + training_assets={"audio_processor": ap}, +) +trainer.fit() diff --git a/requirements.txt b/requirements.txt index a87a3c6f5c..3ec33ceb02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,6 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 -pyworld \ No newline at end of file +pyworld diff --git a/tests/aux_tests/test_text_processing.py b/tests/aux_tests/test_text_processing.py index 3c424a15a1..62d60a42bf 100644 --- a/tests/aux_tests/test_text_processing.py +++ b/tests/aux_tests/test_text_processing.py @@ -9,12 +9,12 @@ EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" +EXPECTED_PHONEMES = "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ| f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s|,| ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l| f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ|!" # ----------------------------------------------------------------------------- -class TextProcessingTextCase(unittest.TestCase): +class TextProcessingTestCase(unittest.TestCase): """Tests for text to phoneme conversion""" def test_phoneme_to_sequence(self): @@ -40,7 +40,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -51,7 +51,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt ɐn! 
ɛkoʊ" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -62,7 +62,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -73,7 +73,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -86,7 +86,7 @@ def _test_phoneme_to_sequence(self, add_blank): ) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~" + gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params)
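For reviewers checking the gruut 2.x migration above: a minimal, hedged sketch of how the updated `text2phone` signature (including the new `keep_stress` flag added in this patch) can be exercised to inspect the pipe-delimited output that the revised test expectations reflect. The sample sentence and the output shown in the comment are illustrative only; actual phonemes depend on the installed gruut language models.

```python
# Sanity-check sketch for the gruut 2.x phonemization path.
# Assumes 🐸TTS with the changes above and gruut[...]~=2.0.0 from requirements.txt are installed.
from TTS.tts.utils.text import text2phone

text = "Be a voice, not an echo."
# use_espeak_phonemes keeps backwards compatibility with models trained on eSpeak IPA;
# keep_stress=False strips primary/secondary stress marks via gruut_ipa.IPA.without_stress.
ph = text2phone(text, "en-us", use_espeak_phonemes=True, keep_stress=False)
print(ph)  # phonemes are "|"-joined within a word and words are joined by "| ", e.g. "b|iː| ɐ| v|ɔɪ|s|,| ..."
```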
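Similarly, a small hedged sketch of the refactored `TrainerCallback` dispatch above, which now receives the trainer explicitly and unwraps DDP-wrapped models through their `module` attribute. `DummyModel` and `DummyTrainer` are hypothetical stand-ins used only to show the `hasattr`-based hook lookup; they are not part of 🐸TTS.

```python
# Assumes the refactored TTS/utils/callbacks.py from this patch is installed.
from TTS.utils.callbacks import TrainerCallback


class DummyModel:
    # Optional hook: discovered via the hasattr() checks inside TrainerCallback.
    def on_epoch_start(self, trainer):
        print("epoch start, steps done so far:", trainer.total_steps_done)


class DummyTrainer:
    def __init__(self):
        self.model = DummyModel()  # a DDP-wrapped model would expose the real model as self.model.module
        self.criterion = None      # objects without the hook are silently skipped
        self.optimizer = None
        self.total_steps_done = 0


TrainerCallback.on_epoch_start(DummyTrainer())  # -> calls DummyModel.on_epoch_start(trainer)
```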