Skip to content

Commit

Permalink
Merge pull request #777 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.2.1
  • Loading branch information
erogol authored Aug 31, 2021
2 parents c308226 + 2b7e55f commit 5793dca
Show file tree
Hide file tree
Showing 50 changed files with 489 additions and 342 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,5 @@ deps.json
speakers.json
internal/*
*_pitch.npy
*_phoneme.npy
*_phoneme.npy
wandb
6 changes: 6 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ disable=missing-docstring,
too-many-public-methods,
too-many-lines,
bare-except,
## for avoiding weird p3.6 CI linter error
## TODO: see later if we can remove this
assigning-non-slot,
unsupported-assignment-operation,
## end
line-too-long,
fixme,
wrong-import-order,
Expand All @@ -73,6 +78,7 @@ disable=missing-docstring,
invalid-name,
too-many-instance-attributes,
arguments-differ,
arguments-renamed,
no-name-in-module,
no-member,
unsubscriptable-object,
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ You can also help us implement more models.
## Install TTS
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**.

If you are only interested in [synthesizing speech](https://github.com/coqui-ai/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released 🐸TTS models, installing from PyPI is the easiest option.
If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.

```bash
pip install TTS
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.0
0.2.1
2 changes: 1 addition & 1 deletion TTS/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
version = f.read().strip()

__version__ = version
4 changes: 2 additions & 2 deletions TTS/bin/compute_attention_masks.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
enable_eos_bos=C.enable_eos_bos_chars,
)

dataset.sort_items()
dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
loader = DataLoader(
dataset,
batch_size=args.batch_size,
Expand Down Expand Up @@ -158,7 +158,7 @@
# output metafile
metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

with open(metafile, "w") as f:
with open(metafile, "w", encoding="utf-8") as f:
for p in file_paths:
f.write(f"{p[0]}|{p[1]}\n")
print(f" >> Metafile created: {metafile}")
3 changes: 2 additions & 1 deletion TTS/bin/distribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def main():
command.append("--restore_path={}".format(args.restore_path))
command.append("--config_path={}".format(args.config_path))
command.append("--group_id=group_{}".format(group_id))
command.append("--use_ddp=true")
command += unargs
command.append("")

Expand All @@ -42,7 +43,7 @@ def main():
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = "--rank={}".format(i)
# prevent stdout for processes with rank != 0
stdout = None if i == 0 else open(os.devnull, "w")
stdout = None
p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with
processes.append(p)
print(command)
Expand Down
4 changes: 2 additions & 2 deletions TTS/bin/extract_tts_spectrograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def setup_loader(ap, r, verbose=False):
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_items()
dataset.sort_and_filter_items(c.get("sort_by_audio_len", default=False))

loader = DataLoader(
dataset,
Expand Down Expand Up @@ -215,7 +215,7 @@ def extract_spectrograms(
wav = ap.inv_melspectrogram(mel)
ap.save_wav(wav, wav_gl_path)

with open(os.path.join(output_path, metada_name), "w") as f:
with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
for data in export_metadata:
f.write(f"{data[0]}|{data[1]+'.npy'}\n")

Expand Down
4 changes: 2 additions & 2 deletions TTS/config/shared_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ class BaseTrainingConfig(Coqpit):
Name of the model that is used in the training.
run_name (str):
Name of the experiment. This prefixes the output folder name.
Name of the experiment. This prefixes the output folder name. Defaults to `coqui_tts`.
run_description (str):
Short description of the experiment.
Expand Down Expand Up @@ -272,7 +272,7 @@ class BaseTrainingConfig(Coqpit):
"""

model: str = None
run_name: str = ""
run_name: str = "coqui_tts"
run_description: str = ""
# training params
epochs: int = 10000
Expand Down
18 changes: 7 additions & 11 deletions TTS/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,31 @@ class BaseModel(nn.Module, ABC):
"""

@abstractmethod
def forward(self, text: torch.Tensor, aux_input={}, **kwargs) -> Dict:
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
"""Forward pass for the model mainly used in training.
You can be flexible here and use different number of arguments and argument names since it is mostly used by
`train_step()` in training whitout exposing it to the out of the class.
You can be flexible here and use different number of arguments and argument names since it is intended to be
used by `train_step()` without exposing it out of the model.
Args:
text (torch.Tensor): Input text character sequence ids.
input (torch.Tensor): Input tensor.
aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs
for the model.
Returns:
Dict: model outputs. This must include an item keyed `model_outputs` as the final artifact of the model.
Dict: Model outputs. Main model output must be named as "model_outputs".
"""
outputs_dict = {"model_outputs": None}
...
return outputs_dict

@abstractmethod
def inference(self, text: torch.Tensor, aux_input={}) -> Dict:
def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
"""Forward pass for inference.
After the model is trained, this is the only function that connects the model to the outside world.
This function must only take an `input` tensor and a dictionary that has all the other model specific inputs.
We don't use `**kwargs` since it is problematic with the TorchScript API.
Args:
text (torch.Tensor): [description]
input (torch.Tensor): [description]
aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion TTS/speaker_encoder/losses.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn


# adapted from https://github.com/cvqluu/GE2E-Loss
Expand Down
2 changes: 1 addition & 1 deletion TTS/speaker_encoder/models/resnet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import torch
import torch.nn as nn
from torch import nn

from TTS.utils.io import load_fsspec

Expand Down
8 changes: 4 additions & 4 deletions TTS/speaker_encoder/speaker_encoder_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import asdict, dataclass, field
from typing import List
from typing import Dict, List

from coqpit import MISSING

Expand All @@ -14,7 +14,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# model params
model_params: dict = field(
model_params: Dict = field(
default_factory=lambda: {
"model_name": "lstm",
"input_dim": 80,
Expand All @@ -25,9 +25,9 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
}
)

audio_augmentation: dict = field(default_factory=lambda: {})
audio_augmentation: Dict = field(default_factory=lambda: {})

storage: dict = field(
storage: Dict = field(
default_factory=lambda: {
"sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, # the size of the in-memory storage with respect to a single batch
Expand Down
3 changes: 2 additions & 1 deletion TTS/speaker_encoder/utils/prepare_voxceleb.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def download_and_extract(directory, subset, urls):
extract_path = zip_filepath.strip(".zip")

# check zip file md5sum
md5 = hashlib.md5(open(zip_filepath, "rb").read()).hexdigest()
with open(zip_filepath, "rb") as f_zip:
md5 = hashlib.md5(f_zip.read()).hexdigest()
if md5 != MD5SUM[subset]:
raise ValueError("md5sum of %s mismatch" % zip_filepath)

Expand Down
Loading

0 comments on commit 5793dca

Please sign in to comment.