Merge pull request #777 from coqui-ai/dev

v0.2.1
coqui-ai · Aug 31, 2021 · 5793dca · 5793dca
2 parents c308226 + 2b7e55f
commit 5793dca
Show file tree

Hide file tree

Showing 50 changed files with 489 additions and 342 deletions.
diff --git a/.gitignore b/.gitignore
@@ -155,4 +155,5 @@ deps.json
 speakers.json
 internal/*
 *_pitch.npy
-*_phoneme.npy
+*_phoneme.npy
+wandb
diff --git a/.pylintrc b/.pylintrc
@@ -64,6 +64,11 @@ disable=missing-docstring,
         too-many-public-methods,
         too-many-lines,
         bare-except,
+        ## disabled to avoid an odd linter error on the Python 3.6 CI
+        ## TODO: check later whether these can be removed
+        assigning-non-slot,
+        unsupported-assignment-operation,
+        ## end
         line-too-long,
         fixme,
         wrong-import-order,
@@ -73,6 +78,7 @@ disable=missing-docstring,
         invalid-name,
         too-many-instance-attributes,
         arguments-differ,
+        arguments-renamed,
         no-name-in-module,
         no-member,
         unsubscriptable-object,

diff --git a/README.md b/README.md
@@ -102,7 +102,7 @@ You can also help us implement more models.
 ## Install TTS
 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**.
 
-If you are only interested in [synthesizing speech](https://github.com/coqui-ai/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released 🐸TTS models, installing from PyPI is the easiest option.
+If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
 
 ```bash
 pip install TTS

diff --git a/TTS/VERSION b/TTS/VERSION
@@ -1 +1 @@
-0.2.0
+0.2.1
diff --git a/TTS/__init__.py b/TTS/__init__.py
@@ -1,6 +1,6 @@
 import os
 
-with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
+with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
     version = f.read().strip()
 
 __version__ = version
diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py
@@ -97,7 +97,7 @@
         enable_eos_bos=C.enable_eos_bos_chars,
     )
 
-    dataset.sort_items()
+    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
     loader = DataLoader(
         dataset,
         batch_size=args.batch_size,
@@ -158,7 +158,7 @@
         # ourput metafile
         metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
 
-        with open(metafile, "w") as f:
+        with open(metafile, "w", encoding="utf-8") as f:
             for p in file_paths:
                 f.write(f"{p[0]}|{p[1]}\n")
         print(f" >> Metafile created: {metafile}")
diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py
@@ -32,6 +32,7 @@ def main():
     command.append("--restore_path={}".format(args.restore_path))
     command.append("--config_path={}".format(args.config_path))
     command.append("--group_id=group_{}".format(group_id))
+    command.append("--use_ddp=true")
     command += unargs
     command.append("")
 
@@ -42,7 +43,7 @@ def main():
         my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
         command[-1] = "--rank={}".format(i)
         # prevent stdout for processes with rank != 0
-        stdout = None if i == 0 else open(os.devnull, "w")
+        stdout = None
         p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)  # pylint: disable=consider-using-with
         processes.append(p)
         print(command)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
@@ -46,7 +46,7 @@ def setup_loader(ap, r, verbose=False):
     if c.use_phonemes and c.compute_input_seq_cache:
         # precompute phonemes to have a better estimate of sequence lengths.
         dataset.compute_input_seq(c.num_loader_workers)
-    dataset.sort_items()
+    dataset.sort_and_filter_items(c.get("sort_by_audio_len", default=False))
 
     loader = DataLoader(
         dataset,
@@ -215,7 +215,7 @@ def extract_spectrograms(
                 wav = ap.inv_melspectrogram(mel)
                 ap.save_wav(wav, wav_gl_path)
 
-    with open(os.path.join(output_path, metada_name), "w") as f:
+    with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
         for data in export_metadata:
             f.write(f"{data[0]}|{data[1]+'.npy'}\n")
 

diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py
@@ -190,7 +190,7 @@ class BaseTrainingConfig(Coqpit):
             Name of the model that is used in the training.
 
         run_name (str):
-            Name of the experiment. This prefixes the output folder name.
+            Name of the experiment. This prefixes the output folder name. Defaults to `coqui_tts`.
 
         run_description (str):
             Short description of the experiment.
@@ -272,7 +272,7 @@ class BaseTrainingConfig(Coqpit):
     """
 
     model: str = None
-    run_name: str = ""
+    run_name: str = "coqui_tts"
     run_description: str = ""
     # training params
     epochs: int = 10000

diff --git a/TTS/model.py b/TTS/model.py
@@ -23,35 +23,31 @@ class BaseModel(nn.Module, ABC):
     """
 
     @abstractmethod
-    def forward(self, text: torch.Tensor, aux_input={}, **kwargs) -> Dict:
+    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
         """Forward pass for the model mainly used in training.
 
-        You can be flexible here and use different number of arguments and argument names since it is mostly used by
-        `train_step()` in training whitout exposing it to the out of the class.
+        You can be flexible here and use different number of arguments and argument names since it is intended to be
+        used by `train_step()` without exposing it out of the model.
 
         Args:
-            text (torch.Tensor): Input text character sequence ids.
+            input (torch.Tensor): Input tensor.
             aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.
-                for the model.
 
         Returns:
-            Dict: model outputs. This must include an item keyed `model_outputs` as the final artifact of the model.
+            Dict: Model outputs. Main model output must be named as "model_outputs".
         """
         outputs_dict = {"model_outputs": None}
         ...
         return outputs_dict
 
     @abstractmethod
-    def inference(self, text: torch.Tensor, aux_input={}) -> Dict:
+    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
         """Forward pass for inference.
 
-        After the model is trained this is the only function that connects the model the out world.
-
-        This function must only take a `text` input and a dictionary that has all the other model specific inputs.
         We don't use `*kwargs` since it is problematic with the TorchScript API.
 
         Args:
-            text (torch.Tensor): [description]
+            input (torch.Tensor): Input tensor.
             aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
 
         Returns:

diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py
@@ -1,6 +1,6 @@
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
+from torch import nn
 
 
 # adapted from https://github.com/cvqluu/GE2E-Loss

diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py
@@ -1,6 +1,6 @@
 import numpy as np
 import torch
-import torch.nn as nn
+from torch import nn
 
 from TTS.utils.io import load_fsspec
 

diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py
@@ -1,5 +1,5 @@
 from dataclasses import asdict, dataclass, field
-from typing import List
+from typing import Dict, List
 
 from coqpit import MISSING
 
@@ -14,7 +14,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
     # model params
-    model_params: dict = field(
+    model_params: Dict = field(
         default_factory=lambda: {
             "model_name": "lstm",
             "input_dim": 80,
@@ -25,9 +25,9 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
         }
     )
 
-    audio_augmentation: dict = field(default_factory=lambda: {})
+    audio_augmentation: Dict = field(default_factory=lambda: {})
 
-    storage: dict = field(
+    storage: Dict = field(
         default_factory=lambda: {
             "sample_from_storage_p": 0.66,  # the probability with which we'll sample from the DataSet in-memory storage
             "storage_size": 15,  # the size of the in-memory storage with respect to a single batch

diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/speaker_encoder/utils/prepare_voxceleb.py
@@ -94,7 +94,8 @@ def download_and_extract(directory, subset, urls):
         extract_path = zip_filepath.strip(".zip")
 
         # check zip file md5sum
-        md5 = hashlib.md5(open(zip_filepath, "rb").read()).hexdigest()
+        with open(zip_filepath, "rb") as f_zip:
+            md5 = hashlib.md5(f_zip.read()).hexdigest()
         if md5 != MD5SUM[subset]:
             raise ValueError("md5sum of %s mismatch" % zip_filepath)