Merge pull request #914 from coqui-ai/dev
v0.4.2
erogol authored Dec 8, 2021
2 parents 33aa27e + bce143c commit 7f1a237
Showing 24 changed files with 398 additions and 177 deletions.
56 changes: 38 additions & 18 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -6,33 +6,53 @@ labels: bug
 assignees: ''
 
 ---
+<!-- Welcome to the 🐸TTS!
+We are excited to see your interest, and appreciate your support! --->
+## 🐛 Description
 
-Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support!
+<!-- A clear and concise description of what the bug is. -->
 
-This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
+### To Reproduce
 
-If you've found a bug, please provide the following information:
+<!--
+Please share your code to reproduce the error. Issues are fixed faster if you can provide a working example.
 
-**Describe the bug**
-A clear and concise description of what the bug is.
+The best place for sharing code is Colab: https://colab.research.google.com/
+so we can directly run your code and reproduce the issue.
 
-**To Reproduce**
-Steps to reproduce the behavior:
+In the worst case, provide steps to reproduce the behaviour:
+
 1. Run the following command '...'
 2. ...
 3. See error
+-->
+
+### Expected behavior
+
+<!-- Describe the expected behaviour. -->
+
+### Environment
+
+<!--
+You can either run `TTS/bin/collect_env_info.py`
+
+```bash
+wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_details.py
+python collect_env_details.py
+```
 
-**Expected behavior**
-A clear and concise description of what you expected to happen.
+or fill in the fields below manually.
+-->
 
-**Environment (please complete the following information):**
-- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
-- **PyTorch or TensorFlow version (use command below)**:
-- **Python version**:
-- **CUDA/cuDNN version**:
-- **GPU model and memory**:
-- **Exact command to reproduce**:
+- 🐸TTS Version (e.g., 1.3.0):
+- PyTorch Version (e.g., 1.8):
+- Python version:
+- OS (e.g., Linux):
+- CUDA/cuDNN version:
+- GPU models and configuration:
+- How you installed PyTorch (`conda`, `pip`, source):
+- Any other relevant information:
 
-**Additional context**
-Add any other context about the problem here.
+### Additional context
+
+<!-- Add any other context about the problem here. -->
23 changes: 11 additions & 12 deletions .github/ISSUE_TEMPLATE/feature_request.md
@@ -2,25 +2,24 @@
 name: 🚀 Feature request
 about: Suggest a feature or an idea for this project
 title: '[Feature request] '
-labels: feature request 
+labels: feature request
 assignees: ''
 
 ---
+<!-- Welcome to the 🐸TTS project!
+We are excited to see your interest, and appreciate your support! --->
+**🚀 Feature Description**
 
-Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support!
+<!--A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] -->
 
-This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
+**Solution**
 
-If you have a feature request, then please provide the following information:
+<!-- A clear and concise description of what you want to happen. -->
 
-**Is your feature request related to a problem? Please describe.**
-A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+**Alternative Solutions**
 
-**Describe the solution you'd like**
-A clear and concise description of what you want to happen.
-
-**Describe alternatives you've considered**
-A clear and concise description of any alternative solutions or features you've considered.
+<!-- A clear and concise description of any alternative solutions or features you've considered. -->
 
 **Additional context**
-Add any other context or screenshots about the feature request here.
+
+<!-- Add any other context or screenshots about the feature request here. -->
2 changes: 1 addition & 1 deletion TTS/.models.json
@@ -98,7 +98,7 @@
         "fast_pitch":{
             "description": "FastPitch model trained on VCTK dataseset.",
             "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
-            "default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
+            "default_vocoder": null,
             "commit": "bdab788d",
             "author": "Eren @erogol",
             "license": "CC BY-NC-ND 4.0",
48 changes: 48 additions & 0 deletions TTS/bin/collect_env_info.py
@@ -0,0 +1,48 @@
+"""Get detailed info about the working environment."""
+import os
+import platform
+import sys
+
+import numpy
+import torch
+
+sys.path += [os.path.abspath(".."), os.path.abspath(".")]
+import json
+
+import TTS
+
+
+def system_info():
+    return {
+        "OS": platform.system(),
+        "architecture": platform.architecture(),
+        "version": platform.version(),
+        "processor": platform.processor(),
+        "python": platform.python_version(),
+    }
+
+
+def cuda_info():
+    return {
+        "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
+        "available": torch.cuda.is_available(),
+        "version": torch.version.cuda,
+    }
+
+
+def package_info():
+    return {
+        "numpy": numpy.__version__,
+        "PyTorch_version": torch.__version__,
+        "PyTorch_debug": torch.version.debug,
+        "TTS": TTS.__version__,
+    }
+
+
+def main():
+    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
+    print(json.dumps(details, indent=4, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()
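
Note: a quick way to sanity-check the new script is to run it and parse its output. The sketch below assumes a repository checkout with torch and numpy installed; the key names come straight from `main()` above.

```python
# Sketch: run the new script and consume its JSON report programmatically.
import json
import subprocess

raw = subprocess.run(
    ["python", "TTS/bin/collect_env_info.py"],
    capture_output=True,
    text=True,
    check=True,
).stdout
details = json.loads(raw)
print(details["System"]["python"])   # Python version string
print(details["CUDA"]["available"])  # whether torch sees a usable GPU
print(details["Packages"]["TTS"])    # installed 🐸TTS version
```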
2 changes: 1 addition & 1 deletion TTS/bin/synthesize.py
@@ -254,7 +254,7 @@ def main():
         print(" > Text: {}".format(args.text))
 
     # kick it
-    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)
+    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style)
 
     # save the results
     print(" > Saving output to {}".format(args.out_path))
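
Note: the new fourth argument forwards the CLI's GST style input into `Synthesizer.tts`. A minimal sketch of the equivalent Python call; the paths are placeholders, the argument order is taken from the diff, and the constructor arguments shown are assumptions:

```python
# Sketch only: mirror the CLI call above from Python. Checkpoint/config paths
# are hypothetical; (text, speaker_idx, speaker_wav, style) follows the diff.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer("model.pth.tar", "config.json", use_cuda=False)
wav = synthesizer.tts("Hello world.", None, None, "style_reference.wav")
synthesizer.save_wav(wav, "out.wav")
```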
2 changes: 1 addition & 1 deletion TTS/server/server.py
@@ -103,7 +103,7 @@ def convert_boolean(x):
     model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
 )
 
-use_multi_speaker = hasattr(synthesizer.tts_model, "speaker_manager") and synthesizer.tts_model.num_speakers > 1
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1
 speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
 # TODO: set this from SpeakerManager
 use_gst = synthesizer.tts_config.get("use_gst", False)
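
Note: the guard now probes the same attribute it reads. A small self-contained sketch of the failure mode the old line allowed:

```python
# Sketch: a model with a speaker_manager but no num_speakers attribute.
class FakeModel:
    speaker_manager = object()

model = FakeModel()
# Old guard: hasattr(model, "speaker_manager") is True, so the right-hand
# side dereferences model.num_speakers and raises AttributeError.
# New guard: hasattr(model, "num_speakers") is False, so it short-circuits.
use_multi_speaker = hasattr(model, "num_speakers") and model.num_speakers > 1
print(use_multi_speaker)  # False, and no exception
```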
16 changes: 8 additions & 8 deletions TTS/trainer.py
@@ -284,8 +284,8 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.optimizer = self.get_optimizer(self.model, self.config)
 
         # CALLBACK
-        self.callbacks = TrainerCallback(self)
-        self.callbacks.on_init_start()
+        self.callbacks = TrainerCallback()
+        self.callbacks.on_init_start(self)
 
         # init AMP
         if self.use_amp_scaler:
@@ -324,7 +324,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         num_params = count_parameters(self.model)
         print("\n > Model has {} parameters".format(num_params))
 
-        self.callbacks.on_init_end()
+        self.callbacks.on_init_end(self)
 
     @staticmethod
     def parse_argv(args: Union[Coqpit, List]):
@@ -677,7 +677,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]:
         Returns:
             Tuple[Dict, Dict]: Model outputs and losses.
         """
-        self.callbacks.on_train_step_start()
+        self.callbacks.on_train_step_start(self)
         # format data
         batch = self.format_batch(batch)
         loader_time = time.time() - loader_start_time
@@ -792,7 +792,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]:
             self.dashboard_logger.flush()
 
         self.total_steps_done += 1
-        self.callbacks.on_train_step_end()
+        self.callbacks.on_train_step_end(self)
         return outputs, loss_dict
 
     def train_epoch(self) -> None:
@@ -983,7 +983,7 @@ def _fit(self) -> None:
             if self.num_gpus > 1:
                 # let all processes sync up before starting with a new epoch of training
                 dist.barrier()
-            self.callbacks.on_epoch_start()
+            self.callbacks.on_epoch_start(self)
             self.keep_avg_train = KeepAverage()
             self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
             self.epochs_done = epoch
@@ -999,7 +999,7 @@ def _fit(self) -> None:
             )
             if self.args.rank in [None, 0]:
                 self.save_best_model()
-            self.callbacks.on_epoch_end()
+            self.callbacks.on_epoch_end(self)
 
     def fit(self) -> None:
         """Where the ✨️magic✨️ happens..."""
@@ -1008,7 +1008,7 @@ def fit(self) -> None:
             if self.args.rank == 0:
                 self.dashboard_logger.finish()
         except KeyboardInterrupt:
-            self.callbacks.on_keyboard_interrupt()
+            self.callbacks.on_keyboard_interrupt(self)
             # if the output folder is empty remove the run.
             remove_experiment_folder(self.output_path)
             # clear the DDP processes
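
Note: every hook now receives the trainer explicitly instead of `TrainerCallback` capturing it at construction. A minimal sketch of the resulting stateless pattern; the class name and hook bodies here are illustrative, not the library's implementation:

```python
# Sketch of the stateless-callback pattern this diff moves to. Hook names
# mirror the ones called above; LoggingCallback is a hypothetical example.
class LoggingCallback:
    def on_init_start(self, trainer):
        print("init started")

    def on_train_step_end(self, trainer):
        # The trainer handle arrives per call, so one callback instance can
        # serve many trainers and holds no reference between calls.
        if trainer.total_steps_done % 100 == 0:
            print(f"step {trainer.total_steps_done} done")


class DummyTrainer:
    total_steps_done = 100

LoggingCallback().on_train_step_end(DummyTrainer())  # -> "step 100 done"
```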
3 changes: 2 additions & 1 deletion TTS/tts/configs/fast_speech_config.py
@@ -110,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig):
     model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
 
     # multi-speaker settings
+    num_speakers: int = 0
     speakers_file: str = None
     use_speaker_embedding: bool = False
     use_d_vector_file: bool = False
@@ -142,7 +143,7 @@ class FastSpeechConfig(BaseTTSConfig):
     r: int = 1  # DO NOT CHANGE
 
     # dataset configs
-    compute_f0: bool = True
+    compute_f0: bool = False
     f0_cache_path: str = None
 
     # testing
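
Note: the new defaults line up with `ForwardTTSArgs(use_pitch=False)` above: FastSpeech has no pitch predictor, so the dataset skips F0 computation. A small sketch under that assumption:

```python
# Sketch: with use_pitch=False, precomputing f0 would be wasted work, so the
# default flips to False; field names follow the diff above.
from TTS.tts.configs.fast_speech_config import FastSpeechConfig

config = FastSpeechConfig()
assert config.compute_f0 is False
assert config.num_speakers == 0  # the new explicit multi-speaker default
```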
17 changes: 17 additions & 0 deletions TTS/tts/datasets/__init__.py
@@ -91,6 +91,8 @@ def load_tts_samples(
             for idx, ins in enumerate(meta_data_eval_all):
                 attn_file = meta_data[ins[1]].strip()
                 meta_data_eval_all[idx].append(attn_file)
+        # set none for the next iter
+        formatter = None
     return meta_data_train_all, meta_data_eval_all
 
 
@@ -110,3 +112,18 @@ def _get_formatter_by_name(name):
     """Returns the respective preprocessing function."""
     thismodule = sys.modules[__name__]
     return getattr(thismodule, name.lower())
+
+
+def find_unique_chars(data_samples, verbose=True):
+    texts = "".join(item[0] for item in data_samples)
+    chars = set(texts)
+    lower_chars = filter(lambda c: c.islower(), chars)
+    chars_force_lower = [c.lower() for c in chars]
+    chars_force_lower = set(chars_force_lower)
+
+    if verbose:
+        print(f" > Number of unique characters: {len(chars)}")
+        print(f" > Unique characters: {''.join(sorted(chars))}")
+        print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+        print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+    return chars_force_lower
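
Note: `find_unique_chars` only reads `item[0]` of each sample, so any list in the `[text, wav_path, speaker]` layout works. A usage sketch with placeholder paths:

```python
# Usage sketch for the new helper; the sample layout mirrors the formatters,
# and the paths/speaker ids are placeholders.
from TTS.tts.datasets import find_unique_chars

samples = [
    ["Hello World!", "wavs/a.wav", "spk1"],
    ["Grüße aus Köln.", "wavs/b.wav", "spk2"],
]
lower_chars = find_unique_chars(samples)  # prints the stats, returns the lowered set
```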
8 changes: 7 additions & 1 deletion TTS/tts/datasets/formatters.py
@@ -60,7 +60,13 @@ def mozilla_de(root_path, meta_file):
 
 
 def mailabs(root_path, meta_files=None):
-    """Normalizes M-AI-Labs meta data files to TTS format"""
+    """Normalizes M-AI-Labs meta data files to TTS format
+
+    Args:
+        root_path (str): root folder of the MAILAB language folder.
+        meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
+            recursively. Defaults to None
+    """
     speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
     if meta_files is None:
         csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
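
Note: the `speaker_regex` shown above pulls speaker names out of M-AI-Labs `by_book` paths. A quick check with a hypothetical path:

```python
# Sketch: what speaker_regex (from the function above) extracts. The path is
# a made-up example in the M-AI-Labs by_book layout.
import re

speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
match = speaker_regex.search("de_DE/by_book/female/eva_k/grune_haus/wavs/x.wav")
print(match.group("speaker_name"))  # -> "eva_k"
```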
2 changes: 1 addition & 1 deletion TTS/tts/layers/vits/stochastic_duration_predictor.py
@@ -266,7 +266,7 @@ def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0):
 
             flows = list(reversed(self.flows))
             flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
-            z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
+            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
             for flow in flows:
                 z = torch.flip(z, [1])
                 z = flow(z, x_mask, g=x, reverse=reverse)
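
Note: the one-character fix swaps the uniform sampler for the Gaussian one; inverting a normalizing flow expects a standard-normal latent. A quick illustration of the difference:

```python
# torch.rand vs. torch.randn, the substance of the fix above.
import torch

torch.manual_seed(0)
u = torch.rand(5)   # uniform on [0, 1): never negative, mean ~0.5
z = torch.randn(5)  # standard normal: mean ~0, roughly half the values negative
print(u.min() >= 0)   # tensor(True)
print((z < 0).any())  # almost surely tensor(True)
```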
2 changes: 1 addition & 1 deletion TTS/tts/models/base_tts.py
@@ -219,7 +219,7 @@ def get_data_loader(
                 use_phonemes=config.use_phonemes,
                 phoneme_language=config.phoneme_language,
                 enable_eos_bos=config.enable_eos_bos_chars,
-                use_noise_augment=not is_eval,
+                use_noise_augment=False if is_eval else config.use_noise_augment,
                 verbose=verbose,
                 speaker_id_mapping=speaker_id_mapping,
                 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
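
Note: the old expression turned noise augmentation on for every training loader, even when the config disabled it. The new guard in words, as a runnable truth table:

```python
# The guard's semantics: augmentation never runs at eval, and at train time it
# now follows the config instead of being unconditionally on.
def noise_augment_enabled(is_eval: bool, config_flag: bool) -> bool:
    return False if is_eval else config_flag

assert noise_augment_enabled(is_eval=True, config_flag=True) is False
assert noise_augment_enabled(is_eval=False, config_flag=False) is False  # old code said True here
assert noise_augment_enabled(is_eval=False, config_flag=True) is True
```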
10 changes: 5 additions & 5 deletions TTS/tts/utils/synthesis.py
@@ -250,11 +250,11 @@ def synthesis(
     # GST processing
     style_mel = None
     custom_symbols = None
-    if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
-        if isinstance(style_wav, dict):
-            style_mel = style_wav
-        else:
-            style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda)
+    if style_wav:
+        style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda)
+    elif CONFIG.has("gst") and CONFIG.gst and not style_wav:
+        if CONFIG.gst.gst_style_input_weights:
+            style_mel = CONFIG.gst.gst_style_input_weights
     if hasattr(model, "make_symbols"):
         custom_symbols = model.make_symbols(CONFIG)
     # preprocess the given text
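
Note: after the reorder, an explicit `style_wav` always wins and the fixed GST token weights from the config act as the fallback. A stand-in sketch of the precedence; `pick_style` and its return values are illustrative only:

```python
# Sketch of the new precedence, with stand-ins for the real config and
# compute_style_mel; an explicit reference wav always beats config weights.
def pick_style(style_wav, gst_weights):
    if style_wav:
        return f"mel({style_wav})"  # stands in for compute_style_mel(...)
    if gst_weights:
        return gst_weights          # fixed per-token GST weights from config
    return None

print(pick_style("ref.wav", {"0": 0.3}))  # reference wav wins
print(pick_style(None, {"0": 0.3}))       # falls back to config weights
print(pick_style(None, None))             # no style conditioning
```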