diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml index c7dd4f5f99..88a73e8481 100644 --- a/.github/actions/setup-uv/action.yml +++ b/.github/actions/setup-uv/action.yml @@ -4,8 +4,9 @@ runs: using: 'composite' steps: - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: - version: "0.5.4" + version: "0.5.17" enable-cache: true cache-dependency-glob: "**/pyproject.toml" + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index d1060f6be2..03426808cc 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -9,15 +9,9 @@ on: jobs: lint: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Lint check run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8d639d5dee..aa01abb874 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,14 +22,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9, "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | @@ -37,7 +35,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -51,7 +48,7 @@ jobs: - name: Unit tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} @@ -67,22 +64,19 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + python-version: ["3.10", "3.12"] + subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts"]'), matrix.subset) run: | sudo apt-get update sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -96,7 +90,7 @@ jobs: - name: Integration tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ 
matrix.subset }} @@ -107,9 +101,48 @@ jobs: name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* if-no-files-found: ignore + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* + if-no-files-found: ignore coverage: if: always() - needs: [unit, integration] + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62420e9958..2f070ad085 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,13 +7,9 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: "https://github.com/psf/black" - rev: 24.2.0 - hooks: - - id: black - language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b3a973763..5fe9421442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,7 +88,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv run make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code formatting. +9. Format your code. We use ```ruff``` for code formatting. ```bash make style diff --git a/Makefile b/Makefile index 6964773fb5..d86845ddcf 100644 --- a/Makefile +++ b/Makefile @@ -6,55 +6,48 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_tts2: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests2 + coverage run -m pytest -x -v --durations=0 tests/tts_tests2 test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/xtts_tests test_aux: ## run aux tests. 
- coverage run -m nose2 -F -v -B tests.aux_tests - ./run_bash_tests.sh + coverage run -m pytest -x -v --durations=0 tests/aux_tests + +test_zoo: ## run zoo tests. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. - uv run --only-dev black ${target_dirs} + uv run --only-dev ruff format ${target_dirs} lint: ## run linters. uv run --only-dev ruff check ${target_dirs} - uv run --only-dev black ${target_dirs} --check + uv run --only-dev ruff format ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/README.md b/README.md index 9ccf8657ab..db8868b26d 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ repository are also still a useful source of information. ### Voice Conversion - [FreeVC](https://arxiv.org/abs/2210.15418) +- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419) - [OpenVoice](https://arxiv.org/abs/2312.01479) ### Others @@ -115,7 +116,7 @@ You can also help us implement more models. ## Installation -🐸TTS is tested on Ubuntu 24.04 with **python >= 3.9, < 3.13**, but should also +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.13**, but should also work on Mac and Windows. If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. @@ -170,7 +171,7 @@ You can also try out Coqui TTS without installation with the docker image. 
Simply run the following command and you will be able to run TTS: ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server ``` @@ -234,7 +235,7 @@ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) #### Voice conversion (VC) -Converting the voice in `source_wav` to the voice of `target_wav` +Converting the voice in `source_wav` to the voice of `target_wav`: ```python tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") @@ -246,9 +247,13 @@ tts.voice_conversion_to_file( ``` Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/knnvc` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` +For more details, see the +[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html). + #### Voice cloning by combining single speaker TTS model with the default VC model This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is diff --git a/TTS/.models.json b/TTS/.models.json index 36654d0555..05c88bef43 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -787,6 +787,22 @@ "license": "apache 2.0" } }, + "librispeech100": { + "wavlm-hifigan": { + "description": "HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + }, + "wavlm-hifigan_prematched": { + "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + } + }, "ljspeech": { "multiband-melgan": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", @@ -927,18 +943,27 @@ "freevc24": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", + "default_vocoder": null, "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } }, "multi-dataset": { + "knnvc": { + "description": "kNN-VC model from https://github.com/bshall/knn-vc", + "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT", + "commit": null + }, "openvoice_v1": { "hf_url": [ "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null @@ -949,6 +974,7 @@ 
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null diff --git a/TTS/api.py b/TTS/api.py index 86a311112e..abbd164dea 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,6 @@ import tempfile import warnings from pathlib import Path -from typing import Optional from torch import nn @@ -22,15 +21,15 @@ def __init__( self, model_name: str = "", *, - model_path: Optional[str] = None, - config_path: Optional[str] = None, - vocoder_name: Optional[str] = None, - vocoder_path: Optional[str] = None, - vocoder_config_path: Optional[str] = None, - encoder_path: Optional[str] = None, - encoder_config_path: Optional[str] = None, - speakers_file_path: Optional[str] = None, - language_ids_file_path: Optional[str] = None, + model_path: str | None = None, + config_path: str | None = None, + vocoder_name: str | None = None, + vocoder_path: str | None = None, + vocoder_config_path: str | None = None, + encoder_path: str | None = None, + encoder_config_path: str | None = None, + speakers_file_path: str | None = None, + language_ids_file_path: str | None = None, progress_bar: bool = True, gpu: bool = False, ) -> None: @@ -77,8 +76,8 @@ def __init__( super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None - self.synthesizer = None - self.voice_converter = None + self.synthesizer: Synthesizer | None = None + self.voice_converter: Synthesizer | None = None self.model_name = "" self.vocoder_path = vocoder_path @@ -95,7 +94,7 @@ def __init__( if "tts_models" in model_name: self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu=gpu) + self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu) # To allow just TTS("xtts") else: self.load_model_by_name(model_name, vocoder_name, gpu=gpu) @@ -156,25 +155,27 @@ def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() def download_model_by_name( - self, model_name: str, vocoder_name: Optional[str] = None - ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]: + self, model_name: str, vocoder_name: str | None = None + ) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files # we assume that the model knows how to load itself - return None, None, model_path + return None, None, None, None, model_path if model_item.get("default_vocoder") is None: - return model_path, config_path, None + return model_path, config_path, None, None, None if vocoder_name is None: vocoder_name = model_item["default_vocoder"] - vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) - # A local vocoder model will take precedence if specified via vocoder_path - if self.vocoder_path is None or self.vocoder_config_path is None: - self.vocoder_path = vocoder_path - self.vocoder_config_path = vocoder_config_path - return model_path, config_path, None - - def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = 
None, *, gpu: bool = False) -> None: + vocoder_path, vocoder_config_path = None, None + # A local vocoder model will take precedence if already specified in __init__ + if model_item["model_type"] == "tts_models": + vocoder_path = self.vocoder_path + vocoder_config_path = self.vocoder_config_path + if vocoder_path is None or vocoder_config_path is None: + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) + return model_path, config_path, vocoder_path, vocoder_config_path, None + + def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: @@ -183,7 +184,7 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None """ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: + def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -191,12 +192,19 @@ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) self.voice_converter = Synthesizer( - vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu + vc_checkpoint=model_path, + vc_config=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + model_dir=model_dir, + use_cuda=gpu, ) - def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: + def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. 
Args: @@ -208,7 +216,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = self.synthesizer = None self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model @@ -217,8 +227,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=self.vocoder_path, - vocoder_config=self.vocoder_config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, encoder_checkpoint=self.encoder_path, encoder_config=self.encoder_config_path, model_dir=model_dir, @@ -250,11 +260,11 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool def _check_arguments( self, - speaker: Optional[str] = None, - language: Optional[str] = None, - speaker_wav: Optional[str] = None, - emotion: Optional[str] = None, - speed: Optional[float] = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -273,11 +283,11 @@ def _check_arguments( def tts( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, split_sentences: bool = True, **kwargs, ): @@ -323,10 +333,10 @@ def tts( def tts_to_file( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, speed: float = 1.0, pipe_out=None, file_path: str = "output.wav", @@ -378,7 +388,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], ): """Voice conversion with FreeVC. Convert source wav to target speaker. @@ -396,7 +406,7 @@ def voice_conversion( def voice_conversion_to_file( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], file_path: str = "output.wav", pipe_out=None, ) -> str: @@ -419,9 +429,10 @@ def voice_conversion_to_file( def tts_with_vc( self, text: str, - language: str = None, - speaker_wav: str = None, - speaker: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], + speaker: str | None = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. 
@@ -461,10 +472,11 @@ def tts_with_vc( def tts_with_vc_to_file( self, text: str, - language: str = None, - speaker_wav: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], file_path: str = "output.wav", - speaker: str = None, + speaker: str | None = None, split_sentences: bool = True, pipe_out=None, ) -> str: diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index b8f69b54e5..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -113,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index dc0ce5b18b..d450e26fba 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -15,6 +15,88 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. 
You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + return parser.parse_args() + + def compute_embeddings( model_path, config_path, @@ -102,88 +184,9 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -if __name__ == "__main__": +def main(arg_list: list[str] | None = None): setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser( - description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" - """ - Example runs: - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv - """, - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--model_path", - type=str, - help="Path to model checkpoint file. It defaults to the released speaker encoder.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to model config file. It defaults to the released speaker encoder config.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", - ) - parser.add_argument( - "--config_dataset_path", - type=str, - help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", - default=None, - ) - parser.add_argument( - "--output_path", - type=str, - help="Path for output `pth` or `json` file.", - default="speakers.pth", - ) - parser.add_argument( - "--old_file", - type=str, - help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", - default=None, - ) - parser.add_argument( - "--old_append", - help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", - default=False, - action="store_true", - ) - parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) - parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") - parser.add_argument( - "--formatter_name", - type=str, - help="Name of the formatter to use. 
You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_name", - type=str, - help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_train", - type=str, - help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_val", - type=str, - help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - args = parser.parse_args() + args = parse_args(arg_list) compute_embeddings( args.model_path, @@ -200,3 +203,7 @@ def compute_embeddings( disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index acec91c369..1da7a092fb 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import argparse import glob @@ -17,10 +16,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) - +def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -30,7 +26,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: list[str] | None = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -95,6 +97,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index a04005ce39..be9387f015 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,8 +3,8 @@ import argparse import logging -import os import sys +from pathlib import Path import numpy as np import torch @@ -13,8 +13,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer 
from TTS.utils.audio import AudioProcessor @@ -24,56 +26,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -115,18 +127,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -141,9 +153,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -154,16 +166,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for 
_, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -182,7 +202,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -196,7 +216,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -218,74 +238,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, 
help="Path to save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 7c68fdb070..40afa1456c 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,4 +1,4 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging @@ -14,18 +14,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -36,13 +31,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = parser.parse_args() + return parser.parse_args(arg_list) - c = load_config(args.config_path) + +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -50,13 +53,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." + raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" 
) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -74,6 +80,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 5d20db6a59..00d7530427 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -134,7 +134,7 @@ """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -274,13 +274,14 @@ def parse_args() -> argparse.Namespace: "--source_wav", type=str, default=None, - help="Original audio file to convert in the voice of the target_wav", + help="Original audio file to convert into the voice of the target_wav", ) parser.add_argument( "--target_wav", type=str, + nargs="*", default=None, - help="Target audio file to convert in the voice of the source_wav", + help="Audio file(s) of the target voice into which to convert the source_wav", ) parser.add_argument( @@ -290,7 +291,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -309,9 +310,9 @@ def parse_args() -> argparse.Namespace: return args -def main() -> None: +def main(arg_list: list[str] | None = None) -> None: """Entry point for `tts` command line interface.""" - args = parse_args() + args = parse_args(arg_list) stream = sys.stderr if args.pipe_out else sys.stdout setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) @@ -340,18 +341,18 @@ def main() -> None: # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + sys.exit(0) if args.model_info_by_name: model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() + sys.exit(0) # 3) Load a model for further info or TTS/VC device = args.device @@ -377,23 +378,23 @@ def main() -> None: if args.list_speaker_idxs: if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) logger.info(api.speakers) - return + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) logger.info(api.languages) - return + sys.exit(0) # check the arguments against a multi-speaker model. if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): @@ -401,7 +402,7 @@ def main() -> None: "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." 
) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: @@ -430,6 +431,7 @@ def main() -> None: pipe_out=pipe_out, ) logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 84123d2db3..914c729856 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import os @@ -87,7 +86,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data @@ -219,10 +218,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.print_step == 0: print( - " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + f" | > Step:{global_step} Loss:{loss.item():.5f} GradNorm:{grad_norm:.5f} " + f"StepTime:{step_time:.2f} LoaderTime:{loader_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} LR:{current_lr:.6f}", flush=True, ) @@ -236,10 +233,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("") print( - ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " - "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time - ), + f">>> Epoch:{epoch} AvgLoss: {tot_loss / len(data_loader):.5f} GradNorm:{grad_norm:.5f} " + f"EpochTime:{epoch_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} ", flush=True, ) # evaluation @@ -249,7 +244,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + f" | > Epoch:{epoch} AvgLoss: {eval_loss:.5f} ", flush=True, ) # save the best checkpoint @@ -301,7 +296,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion, args.restore_step = model.load_checkpoint( c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion ) - print(" > Model restored from step %d" % args.restore_step, flush=True) + print(f" > Model restored from step {args.restore_step}", flush=True) else: args.restore_step = 0 @@ -311,7 +306,7 @@ def main(args): # pylint: disable=redefined-outer-name scheduler = None num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) if use_cuda: model = model.cuda() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index aa04177068..58122b9005 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -17,7 +17,7 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) @@ -26,7 +26,7 @@ def main(): parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) 
train_args.parse_args(args) # load config.json and register @@ -76,6 +76,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index e5f40c0296..401003504e 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Any, Dict, Union +from typing import Any, Union import fsspec import yaml @@ -54,11 +54,11 @@ def register_config(model_name: str) -> Coqpit: return config_class -def _process_model_name(config_dict: Dict) -> str: +def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: - config_dict (Dict): A dictionary including the config fields. + config_dict (dict): A dictionary including the config fields. Returns: str: Formatted modelname. @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit: +def load_config(config_path: str | os.PathLike[Any]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 7fae77d613..a0a013b0de 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass -from typing import List from coqpit import Coqpit, check_argument from trainer import TrainerConfig @@ -227,7 +226,7 @@ class BaseDatasetConfig(Coqpit): dataset_name: str = "" path: str = "" meta_file_train: str = "" - ignored_speakers: List[str] = None + ignored_speakers: list[str] = None language: str = "" phonemizer: str = "" meta_file_val: str = "" diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 7ac38ed6ee..dac5f0870a 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -104,7 +104,7 @@ def isatty(self): def read_logs(): sys.stdout.flush() - with open(sys.stdout.log_file, "r") as f: + with open(sys.stdout.log_file) as f: return f.read() diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index ebbaa0457b..d2d0ef580d 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import MISSING @@ -12,9 +11,9 @@ class BaseEncoderConfig(BaseTrainingConfig): model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params - model_params: Dict = field( + model_params: dict = field( default_factory=lambda: { "model_name": "lstm", "input_dim": 80, @@ -25,7 +24,7 @@ class BaseEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation: Dict = field(default_factory=lambda: {}) + audio_augmentation: dict = field(default_factory=lambda: {}) # training params epochs: int = 10000 @@ -33,7 +32,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: 
{"betas": [0.9, 0.999], "weight_decay": 0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 @@ -56,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig): def check_values(self): super().check_values() c = asdict(self) - assert ( - c["model_params"]["input_dim"] == self.audio.num_mels - ), " [!] model input dimendion must be equal to melspectrogram dimension." + assert c["model_params"]["input_dim"] == self.audio.num_mels, ( + " [!] model input dimendion must be equal to melspectrogram dimension." + ) diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 2082019aad..c6680c3a25 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -34,7 +34,7 @@ class BaseEncoder(nn.Module): # pylint: disable=W0102 def __init__(self): - super(BaseEncoder, self).__init__() + super().__init__() def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances @@ -107,7 +107,7 @@ def get_criterion(self, c: Coqpit, num_classes=None): elif c.loss == "softmaxproto": criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) else: - raise Exception("The %s not is a loss supported" % c.loss) + raise Exception(f"The {c.loss} not is a loss supported") return criterion def load_checkpoint( diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 5eafcd6005..d7f3a2f4bd 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -7,7 +7,7 @@ class SELayer(nn.Module): def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() + super().__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), @@ -27,7 +27,7 @@ class SEBasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) @@ -73,7 +73,7 @@ def __init__( use_torch_spec=False, audio_config=None, ): - super(ResNetSpeakerEncoder, self).__init__() + super().__init__() self.encoder_type = encoder_type self.input_dim = input_dim diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 495b4def5a..d6c4f9fa50 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -class AugmentWAV(object): +class AugmentWAV: def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index 37619ed0f8..8d50ffd5f5 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo # All rights reserved. 
# @@ -17,7 +16,7 @@ # Only support eager mode and TF>=2.0.0 # pylint: disable=no-member, invalid-name, relative-beyond-top-level # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes -""" voxceleb 1 & 2 """ +"""voxceleb 1 & 2""" import csv import hashlib @@ -81,19 +80,19 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logger.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( - "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + "wget {} --user {} --password {} -O {}".format(url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size) # concatenate all parts into zip files if ".zip" not in zip_filepath: zip_filepath = "_".join(zip_filepath.split("_")[:-1]) - subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + subprocess.call(f"cat {zip_filepath}* > {zip_filepath}.zip", shell=True) zip_filepath += ".zip" extract_path = zip_filepath.strip(".zip") @@ -101,12 +100,12 @@ def download_and_extract(directory, subset, urls): with open(zip_filepath, "rb") as f_zip: md5 = hashlib.md5(f_zip.read()).hexdigest() if md5 != MD5SUM[subset]: - raise ValueError("md5sum of %s mismatch" % zip_filepath) + raise ValueError(f"md5sum of {zip_filepath} mismatch") with zipfile.ZipFile(zip_filepath, "r") as zfile: zfile.extractall(directory) extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) - subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + subprocess.call(f"mv {extract_path_ori} {extract_path}", shell=True) finally: # os.remove(zip_filepath) pass @@ -122,9 +121,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logger.info(f"Child was terminated by signal {retcode}") + logger.info("Child was terminated by signal %d", retcode) except OSError as e: - logger.info(f"Execution failed: {e}") + logger.info("Execution failed: %s", e) retcode = -999 return retcode @@ -138,10 +137,10 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. """ cmd = f"ffmpeg -i {aac_file} {wav_file}" - logger.info(f"Decoding aac file using command line: {cmd}") + logger.info("Decoding aac file using command line: %s", cmd) ret = exec_cmd(cmd) if ret != 0: - logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Failed to decode aac file with retcode %s", ret) logger.error("Please check your ffmpeg installation.") return False return True @@ -156,7 +155,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. 
vox1_dev_wav.csv """ - logger.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s", subset) source_dir = os.path.join(input_dir, subset) files = [] @@ -194,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info("Successfully generated csv file {}".format(csv_file_path)) + logger.info("Successfully generated csv file %s", csv_file_path) def processor(directory, subset, force_process): diff --git a/TTS/model.py b/TTS/model.py index 779b1775a3..39faa7f690 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,6 +1,6 @@ import os from abc import abstractmethod -from typing import Any, Union +from typing import Any import torch from coqpit import Coqpit @@ -48,7 +48,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict def load_checkpoint( self, config: Coqpit, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -64,3 +64,7 @@ def load_checkpoint( It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... + + @property + def device(self) -> torch.device: + return next(self.parameters()).device diff --git a/TTS/server/server.py b/TTS/server/server.py index 6a4642f9a2..753e9103ab 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -8,9 +8,7 @@ import logging import os import sys -from pathlib import Path from threading import Lock -from typing import Union from urllib.parse import parse_qs try: @@ -19,10 +17,9 @@ msg = "Server requires requires flask, use `pip install coqui-tts[server]`" raise ImportError(msg) from e -from TTS.config import load_config +from TTS.api import TTS from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer logger = logging.getLogger(__name__) setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) @@ -60,6 +57,7 @@ def create_argparser() -> argparse.ArgumentParser: parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." @@ -73,8 +71,7 @@ def create_argparser() -> argparse.ArgumentParser: # parse the args args = create_argparser().parse_args() -path = Path(__file__).parent / "../.models.json" -manager = ModelManager(path) +manager = ModelManager(models_file=TTS.get_models_file_path()) # update in-use models to the specified released models. 
model_path = None @@ -86,55 +83,31 @@ def create_argparser() -> argparse.ArgumentParser: # CASE1: list pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() - -# CASE2: load pre-trained model paths -if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - -if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - -# CASE3: set custom model paths -if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path - speakers_file_path = args.speakers_file_path - -if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - -# load models -synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint="", - encoder_config="", - use_cuda=args.use_cuda, -) - -use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( - synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None -) -speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) - -use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( - synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None -) -language_manager = getattr(synthesizer.tts_model, "language_manager", None) + sys.exit(0) + +device = args.device +if args.use_cuda: + device = "cuda" + +# CASE2: load models +model_name = args.model_name if args.model_path is None else None +api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + speakers_file_path=args.speakers_file_path, + # language_ids_file_path=args.language_ids_file_path, +).to(device) # TODO: set this from SpeakerManager -use_gst = synthesizer.tts_config.get("use_gst", False) +use_gst = api.synthesizer.tts_config.get("use_gst", False) app = Flask(__name__) -def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: +def style_wav_uri_to_dict(style_wav: str) -> str | dict: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) @@ -158,27 +131,18 @@ def index(): return render_template( "index.html", show_details=args.show_details, - use_multi_speaker=use_multi_speaker, - use_multi_language=use_multi_language, - speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, - language_ids=language_manager.name_to_id if language_manager is not None else None, + use_multi_speaker=api.is_multi_speaker, + use_multi_language=api.is_multi_lingual, + speaker_ids=api.speakers, + language_ids=api.languages, use_gst=use_gst, ) @app.route("/details") def details(): - if args.config_path is not None and os.path.isfile(args.config_path): - model_config = load_config(args.config_path) - elif args.model_name is not None: - model_config = load_config(config_path) - - if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): - vocoder_config = 
load_config(args.vocoder_config_path) - elif args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + model_config = api.synthesizer.tts_config + vocoder_config = api.synthesizer.vocoder_config or None return render_template( "details.html", @@ -196,17 +160,23 @@ def details(): def tts(): with lock: text = request.headers.get("text") or request.values.get("text", "") - speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "") - language_idx = request.headers.get("language-id") or request.values.get("language_id", "") + speaker_idx = ( + request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None + ) + language_idx = ( + request.headers.get("language-id") or request.values.get("language_id", "") + if api.is_multi_lingual + else None + ) style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) logger.info("Model input: %s", text) logger.info("Speaker idx: %s", speaker_idx) logger.info("Language idx: %s", language_idx) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) + wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") @@ -248,9 +218,9 @@ def mary_tts_api_process(): else: text = request.args.get("INPUT_TEXT", "") logger.info("Model input: %s", text) - wavs = synthesizer.tts(text) + wavs = api.tts(text) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 317a01af53..2224396d1e 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.align_tts import AlignTTSArgs @@ -70,7 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) - phase_start_steps: List[int] = None + phase_start_steps: list[int] = None ssim_alpha: float = 1.0 spec_loss_alpha: float = 1.0 @@ -96,7 +95,7 @@ class AlignTTSConfig(BaseTTSConfig): r: int = 1 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index b846febe85..61d67b987a 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field -from typing import Dict from trainer.io import get_user_data_dir @@ -70,9 +69,9 @@ class BarkConfig(BaseTTSConfig): COARSE_INFER_TOKEN: int = 12_050 REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" - REMOTE_MODEL_PATHS: Dict = None - LOCAL_MODEL_PATHS: Dict = None - SMALL_REMOTE_MODEL_PATHS: Dict = None + REMOTE_MODEL_PATHS: dict = None + LOCAL_MODEL_PATHS: dict = None + SMALL_REMOTE_MODEL_PATHS: dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) DEF_SPEAKER_DIR: str 
= str(get_user_data_dir("tts/bark_v0/speakers")) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 805d995369..7f9e7a6ab2 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig @@ -73,7 +72,7 @@ class DelightfulTTSConfig(BaseTTSConfig): # optimizer steps_to_start_discriminator: int = 200000 - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -140,7 +139,7 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[List[str]] = field( + test_sentences: list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index d086d26564..5b50122e09 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -153,7 +152,7 @@ class FastPitchConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index af6c2db6fa..f375292256 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -147,7 +146,7 @@ class FastSpeechConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index d179617fb0..3d6ce4f4b3 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -168,7 +167,7 @@ class Fastspeech2Config(BaseTTSConfig): energy_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index f42f3e5a51..34b4057093 100644 --- 
a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -171,7 +170,7 @@ class GlowTTSConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index 50f72847ed..bd1736c880 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -126,7 +125,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.001 @@ -143,7 +142,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -162,9 +161,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. 
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index dc3e5548b8..93a6a9e377 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -145,7 +144,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.01 @@ -174,7 +173,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -193,9 +192,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. 
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index bf17322c19..bd5a28b43c 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import Coqpit, check_argument @@ -138,7 +137,7 @@ class CharactersConfig(Coqpit): characters_class: str = None # using BaseVocabulary - vocab_dict: Dict = None + vocab_dict: dict = None # using on BaseCharacters pad: str = None @@ -323,7 +322,7 @@ class BaseTTSConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -331,7 +330,7 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index bf8517dfc4..29221d7b25 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -164,7 +163,7 @@ class SpeedySpeechConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 350b5ea996..e4b419d1fa 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @@ -154,7 +153,7 @@ class TacotronConfig(BaseTTSConfig): num_speakers: int = 1 num_chars: int = 0 r: int = 2 - gradual_training: List[List[int]] = None + gradual_training: list[list[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True @@ -212,7 +211,7 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", @@ -224,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. 
{self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 2d0242bf13..d85684c721 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @@ -112,7 +111,7 @@ class VitsConfig(BaseTTSConfig): audio: VitsAudioConfig = field(default_factory=VitsAudioConfig) # optimizer - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -146,7 +145,7 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[List] = field( + test_sentences: list[list] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], @@ -167,7 +166,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index bbf048e1ab..da6cc6edc6 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig @@ -70,7 +69,7 @@ class XttsConfig(BaseTTSConfig): model_args: XttsArgs = field(default_factory=XttsArgs) audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None - languages: List[str] = field( + languages: list[str] = field( default_factory=lambda: [ "en", "es", diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d1a37da4c1..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -2,8 +2,8 @@ import os import sys from collections import Counter +from collections.abc import Callable from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union import numpy as np @@ -17,7 +17,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): + items (list[list]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. eval_split_max_size (int): @@ -37,10 +37,8 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format( - 1 / len(items) + assert eval_split_size > 0, ( + f" [!] 
You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" ) np.random.seed(0) np.random.shuffle(items) @@ -71,18 +69,18 @@ def add_extra_keys(metadata, language, dataset_name): def load_tts_samples( - datasets: Union[List[Dict], Dict], + datasets: list[dict] | dict, eval_split=True, formatter: Callable = None, eval_split_max_size=None, eval_split_size=0.01, -) -> Tuple[List[List], List[List]]: - """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. +) -> tuple[list[list], list[list]]: + """Parse the dataset from the datasets config, load the samples as a list and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. Args: - datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are + datasets (list[dict], dict): A list of datasets or a single dataset dictionary. If multiple datasets are in the list, they are all merged. eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate @@ -101,7 +99,7 @@ def load_tts_samples( If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). Returns: - Tuple[List[List], List[List]: training and evaluation splits of the dataset. + tuple[list[list], list[list]: training and evaluation splits of the dataset. """ meta_data_train_all = [] meta_data_eval_all = [] if eval_split else None @@ -153,7 +151,7 @@ def load_tts_samples( def load_attention_mask_meta_data(metafile_path): """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r", encoding="utf-8") as f: + with open(metafile_path, encoding="utf-8") as f: lines = f.readlines() meta_data = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 5f629f32a9..6f21dcd1e0 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -3,7 +3,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import numpy as np import numpy.typing as npt @@ -47,7 +47,7 @@ def string2filename(string: str) -> str: return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") -def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: +def get_audio_size(audiopath: str | os.PathLike[Any]) -> int: """Return the number of samples in the audio file.""" if not isinstance(audiopath, str): audiopath = str(audiopath) @@ -63,7 +63,7 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = None): +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict | None = None): """Create inverse frequency weights for balancing the dataset. 
Use `multi_dict` to scale relative weights.""" @@ -94,23 +94,23 @@ def __init__( outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: Optional[list[dict]] = None, + samples: list[dict] | None = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: Optional[str] = None, - energy_cache_path: Optional[str] = None, + f0_cache_path: str | None = None, + energy_cache_path: str | None = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: Optional[str] = None, + phoneme_cache_path: str | None = None, precompute_num_workers: int = 0, - speaker_id_mapping: Optional[dict] = None, - d_vector_mapping: Optional[dict] = None, - language_id_mapping: Optional[dict] = None, + speaker_id_mapping: dict | None = None, + d_vector_mapping: dict | None = None, + language_id_mapping: dict | None = None, use_noise_augment: bool = False, start_by_longest: bool = False, ) -> None: @@ -231,7 +231,7 @@ def lengths(self) -> list[int]: try: audio_len = get_audio_size(wav_file) except RuntimeError: - logger.warning(f"Failed to compute length for {item['audio_file']}") + logger.warning("Failed to compute length for %s", item["audio_file"]) audio_len = 0 lens.append(audio_len) return lens @@ -352,7 +352,7 @@ def _compute_lengths(samples): try: audio_length = get_audio_size(item["audio_file"]) except RuntimeError: - logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + logger.warning("Failed to compute length, skipping %s", item["audio_file"]) continue text_lenght = len(item["text"]) item["audio_length"] = audio_length @@ -437,14 +437,14 @@ def preprocess_samples(self) -> None: self.samples = samples logger.info("Preprocessing samples") - logger.info(f"Max text length: {np.max(text_lengths)}") - logger.info(f"Min text length: {np.min(text_lengths)}") - logger.info(f"Avg text length: {np.mean(text_lengths)}") - logger.info(f"Max audio length: {np.max(audio_lengths)}") - logger.info(f"Min audio length: {np.min(audio_lengths)}") - logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Max text length: %d", np.max(text_lengths)) + logger.info("Min text length: %d", np.min(text_lengths)) + logger.info("Avg text length: %.2f", np.mean(text_lengths)) + logger.info("Max audio length: %.2f", np.max(audio_lengths)) + logger.info("Min audio length: %.2f", np.min(audio_lengths)) + logger.info("Avg audio length: %.2f", np.mean(audio_lengths)) logger.info("Num. 
instances discarded samples: %d", len(ignore_idx)) - logger.info(f"Batch group size: {self.batch_group_size}.") + logger.info("Batch group size: %d", self.batch_group_size) @staticmethod def _sort_batch(batch, text_lengths): @@ -640,7 +640,7 @@ class PhonemeDataset(Dataset): def __init__( self, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], tokenizer: "TTSTokenizer", cache_path: str, precompute_num_workers: int = 0, @@ -744,10 +744,10 @@ class F0Dataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers: int = 0, normalize_f0: bool = True, ) -> None: @@ -896,9 +896,9 @@ class EnergyDataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers=0, normalize_energy=True, ) -> None: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ff1a76e2c9..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree as ET from glob import glob from pathlib import Path -from typing import List from tqdm import tqdm @@ -21,7 +20,7 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): https://github.com/freds0/CML-TTS-Dataset/""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -61,7 +60,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): """Interal dataset formatter.""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -104,7 +103,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "tweb" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") @@ -118,7 +117,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = cols[1].strip() @@ -133,7 +132,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="ISO 8859-1") as ttf: + with open(txt_file, encoding="ISO 8859-1") as ttf: for line in ttf: cols = line.strip().split("|") wav_file = cols[0].strip() @@ -177,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if speaker_name in ignored_speakers: continue logger.info(csv_file) - with open(txt_file, "r", 
encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") if not meta_files: @@ -201,7 +200,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ljspeech" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -215,7 +214,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: speaker_id = 0 for idx, line in enumerate(ttf): # 2 samples per speaker to avoid eval split issues @@ -236,7 +235,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "thorsten" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -268,7 +267,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ruslan" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") @@ -282,7 +281,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "css10" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -296,7 +295,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "nancy" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] @@ -309,7 +308,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("client_id"): continue @@ -338,7 +337,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): for meta_file in meta_files: _meta_file = os.path.basename(meta_file).split(".")[0] - with open(meta_file, "r", encoding="utf-8") as ttf: + with open(meta_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") file_name = cols[0] @@ -368,7 +367,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar items = [] speaker_name = "turkish-female" skipped_files = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav") @@ -386,7 +385,7 @@ def brspeech(root_path, meta_file, 
ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("wav_filename"): continue @@ -425,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -433,7 +432,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] # p280 has no mic2 recordings if speaker_id == "p280": @@ -452,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -460,7 +459,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append( @@ -482,7 +481,7 @@ def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt") ) if os.path.exists(txt_file) and os.path.exists(wav_file): - with open(txt_file, "r", encoding="utf-8") as file_text: + with open(txt_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -500,7 +499,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readline().replace("\n", "") # ignore sentences that contains digits if ignore_digits_sentences and any(map(str.isdigit, text)): @@ -513,7 +512,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] - with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: + with open(os.path.join(root_path, meta_files), encoding="utf-8") as meta: for line in meta: file, text = line.split("\t") text = text[:-1] @@ -553,7 +552,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): # if not exists meta file, 
crawl recursively for 'wav' files if meta_file is not None: - with open(str(meta_file), "r", encoding="utf-8") as f: + with open(str(meta_file), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] elif not cache_to.exists(): @@ -575,7 +574,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}") - with open(str(cache_to), "r", encoding="utf-8") as f: + with open(str(cache_to), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] @@ -583,7 +582,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): """Generic emotion dataset""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("file_path"): continue @@ -601,7 +600,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): return items -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument +def baker(root_path: str, meta_file: str, **kwargs) -> list[list[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -613,7 +612,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) @@ -626,7 +625,7 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kokoro" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -640,7 +639,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kss" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -653,7 +652,7 @@ def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "bel_tts" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index ade84794eb..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,7 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch import nn diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 65c7800dcf..457a20ea28 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,7 +2,6 @@ import os import re from glob import glob -from typing import Dict, List, 
Optional, Tuple import librosa import numpy as np @@ -34,9 +33,9 @@ def _normalize_whitespace(text): return re.sub(r"\s+", " ", text).strip() -def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def get_voices(extra_voice_dirs: list[str] = []): # pylint: disable=dangerous-default-value dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -49,7 +48,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: +def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -58,10 +57,8 @@ def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( - model, voice: str, extra_voice_dirs: List[str] = [] -) -> Tuple[ - Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] -]: # pylint: disable=dangerous-default-value + model, voice: str, extra_voice_dirs: list[str] = [] +) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None @@ -206,8 +203,8 @@ def generate_text_semantic( semantic_history = None encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET if len(encoded_text) > 256: - p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1) - logger.warning(f"warning, text too long, lopping of last {p}%") + p = (len(encoded_text) - 256) / len(encoded_text) * 100 + logger.warning("warning, text too long, lopping of last %.1f%%", p) encoded_text = encoded_text[:256] encoded_text = np.pad( encoded_text, diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 6b7caab916..dcec5b5bbc 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -88,7 +88,7 @@ def clear_cuda_cache(): def load_model(ckpt_path, device, config, model_type="text"): - logger.info(f"loading {model_type} model from {ckpt_path}...") + logger.info("loading %s model from %s...", model_type, ckpt_path) if device == "cpu": logger.warning("No GPU being used. 
Careful, Inference might be extremely slow!") @@ -108,11 +108,13 @@ def load_model(ckpt_path, device, config, model_type="text"): and os.path.exists(ckpt_path) and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"] ): - logger.warning(f"found outdated {model_type} model, removing...") + logger.warning("found outdated %s model, removing...", model_type) os.remove(ckpt_path) if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading...") - _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) + logger.info("%s model not found, downloading...", model_type) + # The URL in the config is a 404 and needs to be fixed + download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") + _download(download_url, ckpt_path, config.CACHE_DIR) checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack @@ -148,7 +150,7 @@ def load_model(ckpt_path, device, config, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info("model loaded: %.1fM params, %.3f loss", n_params / 1e6, val_loss) model.eval() model.to(device) del checkpoint, state_dict diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 54a9cecec0..4850d0a88b 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -175,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use assert idx.shape[1] >= 256 + 256 + 1 t = idx.shape[1] - 256 else: - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) # forward the GPT model itself if merge_context: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 29126b41ab..20f54d2152 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -101,9 +101,9 @@ def __init__(self, config): def forward(self, pred_idx, idx): device = idx.device b, t, codes = idx.size() - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 981d6cdb1f..9110ff5fd0 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,6 +1,6 @@ ### credit: https://github.com/dunky11/voicesmith import logging -from typing import Callable, Dict, Tuple +from collections.abc import Callable import torch import torch.nn.functional as F @@ -177,7 +177,7 @@ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument self._init_d_vector() @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on 
the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -194,11 +194,11 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -237,7 +237,7 @@ def _forward_aligner( x_mask: torch.IntTensor, y_mask: torch.IntTensor, attn_priors: torch.FloatTensor, - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -298,7 +298,7 @@ def forward( use_ground_truth: bool = True, d_vectors: torch.Tensor = None, speaker_idx: torch.Tensor = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable @@ -421,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index 1d5139571e..5cf41d4ff6 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -1,11 +1,9 @@ -from typing import Tuple - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: +def calc_same_padding(kernel_size: int) -> tuple[int, int]: pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) @@ -52,7 +50,7 @@ def __init__( w_init_gain="linear", use_weight_norm=False, ): - super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments + super().__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) @@ -94,7 +92,7 @@ def __init__( lstm_type="bilstm", use_linear=True, ): - super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.out_dim = out_dim self.lstm_type = lstm_type self.use_linear = use_linear diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index bd0c319dc1..31bab8cc97 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple, Union - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F @@ -36,9 +34,9 @@ class ReferenceEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, ): super().__init__() @@ -80,7 +78,7 @@ def __init__( batch_first=True, ) - def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> 
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ inputs --- [N, n_mels, timesteps] outputs --- [N, E//2] @@ -120,9 +118,9 @@ class UtteranceLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, @@ -192,9 +190,9 @@ class PhonemeLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, diff --git a/TTS/tts/layers/delightful_tts/energy_adaptor.py b/TTS/tts/layers/delightful_tts/energy_adaptor.py index ea0d1e4721..d2b4b0ffa8 100644 --- a/TTS/tts/layers/delightful_tts/energy_adaptor.py +++ b/TTS/tts/layers/delightful_tts/energy_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -59,7 +59,7 @@ def __init__( def get_energy_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index 4305022f18..93b65a2a74 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -1,5 +1,4 @@ import math -from typing import Tuple import numpy as np import torch @@ -9,7 +8,7 @@ from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: +def initialize_embeddings(shape: tuple[int]) -> torch.Tensor: assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
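Regarding the `ref_enc_filters`/`ref_enc_strides` annotations rewritten above: the change is a mechanical translation of the old `List[Union[...]]` spelling, but it is worth noting that a union of identical members collapses to that member, so these hints mean nothing more than `list[int]`. A quick illustration (the filter values are made up for the example):

```python
from typing import Union

# A union of identical types collapses to the type itself, so
# list[int | int | int | int | int | int] is equivalent to list[int].
assert Union[int, int, int] is int

ref_enc_filters: list[int] = [32, 32, 64, 64, 128, 128]  # illustrative values
print(ref_enc_filters)
```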
# Kaiming initialization return torch.randn(shape) * np.sqrt(2 / shape[1]) @@ -52,7 +51,7 @@ def __init__( kernel_size=3, use_partial_padding=False, # pylint: disable=unused-argument ): - super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.reduction_factor = reduction_factor reduced_dim = int(in_dim / reduction_factor) @@ -195,7 +194,7 @@ class STL(nn.Module): """ def __init__(self, n_hidden: int, token_num: int): - super(STL, self).__init__() # pylint: disable=super-with-arguments + super().__init__() num_heads = 1 E = n_hidden diff --git a/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/TTS/tts/layers/delightful_tts/pitch_adaptor.py index 9031369e0f..14e751d2e2 100644 --- a/TTS/tts/layers/delightful_tts/pitch_adaptor.py +++ b/TTS/tts/layers/delightful_tts/pitch_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -58,7 +58,7 @@ def __init__( def get_pitch_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py index caf939ffc7..2d08f03c2d 100644 --- a/TTS/tts/layers/feed_forward/encoder.py +++ b/TTS/tts/layers/feed_forward/encoder.py @@ -143,9 +143,9 @@ def __init__( elif encoder_type.lower() == "residual_conv_bn": self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params) elif encoder_type.lower() == "fftransformer": - assert ( - in_hidden_channels == out_channels - ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + assert in_hidden_channels == out_channels, ( + "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + ) # pylint: disable=unexpected-keyword-arg self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) else: diff --git a/TTS/tts/layers/generic/aligner.py b/TTS/tts/layers/generic/aligner.py index baa6f0e9c4..480c48f9a4 100644 --- a/TTS/tts/layers/generic/aligner.py +++ b/TTS/tts/layers/generic/aligner.py @@ -1,5 +1,3 @@ -from typing import Tuple - import torch from torch import nn @@ -68,7 +66,7 @@ def init_layers(self): def forward( self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None - ) -> Tuple[torch.tensor, torch.tensor]: + ) -> tuple[torch.tensor, torch.tensor]: """Forward pass of the aligner encoder. 
Shapes: - queries: :math:`[B, C, T_de]` diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 913add0d14..7765e224aa 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,9 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError( - "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) - ) + raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py index 9b7ecee2ba..2fe9bcc408 100644 --- a/TTS/tts/layers/generic/transformer.py +++ b/TTS/tts/layers/generic/transformer.py @@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument class FFTDurationPredictor: - def __init__( - self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None - ): # pylint: disable=unused-argument + def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None): # pylint: disable=unused-argument self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p) self.proj = nn.Linear(in_channels, 1) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index db62430c9d..1e744d62cf 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -814,7 +814,7 @@ def __init__(self, c): elif c.spec_loss_type == "l1": self.spec_loss = L1LossMasked(False) else: - raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type)) + raise ValueError(f" [!] Unknown spec_loss_type {c.spec_loss_type}") if c.duration_loss_type == "mse": self.dur_loss = MSELossMasked(False) @@ -823,7 +823,7 @@ def __init__(self, c): elif c.duration_loss_type == "huber": self.dur_loss = Huber() else: - raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type)) + raise ValueError(f" [!] Unknown duration_loss_type {c.duration_loss_type}") if c.model_args.use_aligner: self.aligner_loss = ForwardSumLoss() diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index 9f77af293c..a477b34f0b 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,5 +1,4 @@ import logging -from typing import List, Tuple import torch import torch.nn.functional as F @@ -44,7 +43,7 @@ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutio ) self.rnn_state = None - def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]: + def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> tuple[torch.FloatTensor, torch.LongTensor]: """Forward pass to the encoder. 
Args: @@ -110,7 +109,7 @@ class ParameterModel(nn.Module): def __init__( self, - outputnet_size: List[int], + outputnet_size: list[int], input_size: int, output_size: int, frame_channels: int, @@ -152,7 +151,7 @@ def __init__( encoder_dim: int, memory_rnn_dim: int, frame_channels: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float = 1e-2, ): diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index a12becef03..9142f65e8c 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -1,5 +1,3 @@ -from typing import List - import torch import torch.distributions as tdist import torch.nn.functional as F @@ -57,7 +55,7 @@ def __init__( prenet_dropout: float, prenet_dropout_at_inference: bool, memory_rnn_dim: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float, use_grad_checkpointing: bool = True, diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 32643dfcee..6f33edf3d7 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,3 @@ -# coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch import logging diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 1bbf676393..00fa559c77 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -101,9 +101,9 @@ def __init__( if num_head_channels == -1: self.num_heads = num_heads else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + assert channels % num_head_channels == 0, ( + f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + ) self.num_heads = channels // num_head_channels self.norm = normalization(channels) self.qkv = nn.Conv1d(channels, channels * 3, 1) diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index c67ee6c44b..6bbe6c389c 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,7 +1,6 @@ import logging import os from glob import glob -from typing import Dict, List import librosa import numpy as np @@ -88,9 +87,9 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def get_voices(extra_voice_dirs: List[str] = []): +def get_voices(extra_voice_dirs: list[str] = []): dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -100,7 +99,7 @@ def get_voices(extra_voice_dirs: List[str] = []): return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): +def load_voice(voice: str, extra_voice_dirs: list[str] = []): if voice == "random": return None, None @@ -116,7 +115,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return conds, None -def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): +def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): latents = [] clips = [] for voice in voices: @@ -126,14 +125,14 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: - assert ( - len(latents) == 0 - ), "Can only combine raw audio voices or latent voices, 
not both. Do it yourself if you want this." + assert len(latents) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) clips.extend(clip) elif clip is None: - assert ( - len(clips) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(clips) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) latents.append(latent) if len(latents) == 0: return clips, None diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index 00c884e973..eaeb2a03c1 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,7 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools import random -from typing import Optional import torch import torch.nn as nn @@ -609,9 +608,9 @@ def inference_speech( if input_tokens is None: inputs = fake_inputs else: - assert ( - num_return_sequences % input_tokens.shape[0] == 0 - ), "The number of return sequences must be divisible by the number of input sequences" + assert num_return_sequences % input_tokens.shape[0] == 0, ( + "The number of return sequences must be divisible by the number of input sequences" + ) fake_inputs = fake_inputs.repeat(num_return_sequences, 1) input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) inputs = torch.cat([fake_inputs, input_tokens], dim=1) @@ -640,8 +639,8 @@ def inference_speech( def _prepare_attention_mask_for_generation( inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + pad_token_id: torch.Tensor | None, + eos_token_id: torch.Tensor | None, ) -> torch.LongTensor: # No information for attention mask inference -> return default attention mask default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 2b29091b44..cfb8fa800d 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -653,7 +653,7 @@ def p_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: @@ -805,7 +805,7 @@ def ddim_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index 6a1d8ff784..c8892d456a 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -98,9 +98,7 @@ def __init__( if schedule not in ["discrete", "linear", "cosine"]: raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule - ) + f"Unsupported noise schedule {schedule}. 
The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule @@ -150,7 +148,7 @@ def marginal_log_mean_coeff(self, t): t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device), - ).reshape((-1)) + ).reshape(-1) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 elif self.schedule == "cosine": @@ -447,7 +445,7 @@ def correcting_xt_fn(xt, t, step): Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. """ - self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) + self.model = lambda x, t: model_fn(x, t.expand(x.shape[0])) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type @@ -527,7 +525,7 @@ def get_time_steps(self, skip_type, t_T, t_0, N, device): return t else: raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type) + f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): @@ -565,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type if order == 3: K = steps // 3 + 1 if steps % 3 == 0: - orders = [ - 3, - ] * ( - K - 2 - ) + [2, 1] + orders = [3] * (K - 2) + [2, 1] elif steps % 3 == 1: - orders = [ - 3, - ] * ( - K - 1 - ) + [1] + orders = [3] * (K - 1) + [1] else: - orders = [ - 3, - ] * ( - K - 1 - ) + [2] + orders = [3] * (K - 1) + [2] elif order == 2: if steps % 2 == 0: K = steps // 2 - orders = [ - 2, - ] * K + orders = [2] * K else: K = steps // 2 + 1 - orders = [ - 2, - ] * ( - K - 1 - ) + [1] + orders = [2] * (K - 1) + [1] elif order == 1: K = 1 - orders = [ - 1, - ] * steps + orders = [1] * steps else: raise ValueError("'order' must be '1' or '2' or '3'.") if skip_type == "logSNR": @@ -607,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) else: timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum( - torch.tensor( - [ - 0, - ] - + orders - ), - 0, - ).to(device) + torch.cumsum(torch.tensor([0] + orders), 0).to(device) ] return timesteps_outer, orders @@ -693,7 +663,7 @@ def singlestep_dpm_solver_second_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 0.5 ns = self.noise_schedule @@ -790,7 +760,7 @@ def singlestep_dpm_solver_third_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: @@ -913,7 +883,7 @@ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, x_t: A pytorch tensor. The approximated solution at time `t`. 
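The `get_orders_and_timesteps_for_singlestep_solver` cleanup above is behavior-preserving: `[3,] * (K - 2)` and `[3] * (K - 2)` build the same list, and the outer timesteps are still selected by a cumulative sum over the solver orders. A small sketch of that index computation, using an illustrative `steps=7`, `order=3` split:

```python
import torch

# For steps=7 and order=3: K = 7 // 3 + 1 = 3 and 7 % 3 == 1,
# so orders = [3] * (K - 1) + [1] = [3, 3, 1].
orders = [3, 3, 1]
idx = torch.cumsum(torch.tensor([0] + orders), 0)
print(idx.tolist())  # [0, 3, 6, 7] -- indices of the outer timesteps
```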
""" if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") ns = self.noise_schedule model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1] t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1] @@ -1062,7 +1032,7 @@ def singlestep_dpm_solver_update( r2=r2, ) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"): """ @@ -1086,7 +1056,7 @@ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, elif order == 3: return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def dpm_solver_adaptive( self, @@ -1150,8 +1120,8 @@ def higher_update(x, s, t, **kwargs): return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: + raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}") + while torch.abs(s - t_0).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) @@ -1219,9 +1189,9 @@ def inverse( """ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start t_T = self.noise_schedule.T if t_end is None else t_end - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) return self.sample( x, steps=steps, @@ -1364,9 +1334,9 @@ def sample( """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. 
For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) if return_intermediate: assert method in [ "multistep", @@ -1487,7 +1457,7 @@ def sample( if return_intermediate: intermediates.append(x) else: - raise ValueError("Got wrong method {}".format(method)) + raise ValueError(f"Got wrong method {method}") if denoise_to_zero: t = torch.ones((1,)).to(device) * t_0 x = self.denoise_to_zero_fn(x, t) diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index ed4d79d4ab..531f294220 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union +from typing import TypeVar import torch import torch.nn.functional as F @@ -11,7 +11,7 @@ _T = TypeVar("_T") -def cast_tuple(val: Union[tuple[_T], list[_T], _T], depth: int = 1) -> tuple[_T]: +def cast_tuple(val: tuple[_T] | list[_T] | _T, depth: int = 1) -> tuple[_T]: if isinstance(val, list): return tuple(val) return val if isinstance(val, tuple) else (val,) * depth @@ -43,9 +43,9 @@ def route_args(router, args, depth): class SequentialSequence(nn.Module): def __init__(self, layers, args_route={}, layer_dropout=0.0): super().__init__() - assert all( - len(route) == len(layers) for route in args_route.values() - ), "each argument route map must have the same depth as the number of sequential layers" + assert all(len(route) == len(layers) for route in args_route.values()), ( + "each argument route map must have the same depth as the number of sequential layers" + ) self.layers = layers self.args_route = args_route self.layer_dropout = layer_dropout diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index a5200c2673..e7497d8190 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,6 +1,6 @@ +from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional import torch import torch.nn as nn @@ -293,7 +293,7 @@ def __init__( hop_length=256, n_mel_channels=100, ): - super(UnivNetGenerator, self).__init__() + super().__init__() self.mel_channel = n_mel_channels self.noise_dim = noise_dim self.hop_length = hop_length @@ -344,7 +344,7 @@ def forward(self, c, z): return z def eval(self, inference=False): - super(UnivNetGenerator, self).eval() + super().eval() # don't remove weight norm while validation in training loop if inference: self.remove_weight_norm() @@ -378,7 +378,7 @@ def inference(self, c, z=None): class VocType: constructor: Callable[[], nn.Module] model_path: str - subkey: Optional[str] = None + subkey: str | None = None def optionally_index(self, model_dict): if self.subkey is not None: diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 0892fee19d..b2e74cf118 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -560,9 +560,9 @@ def __init__( self.rel_pos_bias = rel_pos_bias if rel_pos_bias: - assert ( - rel_pos_num_buckets <= rel_pos_max_distance - ), "number of relative position buckets must be less than the relative position max distance" + assert rel_pos_num_buckets <= rel_pos_max_distance, ( + "number of relative position buckets must be less than the relative position max distance" + ) self.rel_pos = RelativePositionBias( scale=dim_head**0.5, causal=causal, @@ -680,9 +680,9 @@ def forward( del input_mask if exists(attn_mask): - assert ( - 2 <= 
attn_mask.ndim <= 4 - ), "attention mask must have greater than 2 dimensions but less than or equal to 4" + assert 2 <= attn_mask.ndim <= 4, ( + "attention mask must have greater than 2 dimensions but less than or equal to 4" + ) if attn_mask.ndim == 2: attn_mask = rearrange(attn_mask, "i j -> () () i j") elif attn_mask.ndim == 3: @@ -790,9 +790,9 @@ def __init__( rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None - assert not ( - alibi_pos_bias and rel_pos_bias - ), "you can only choose Alibi positional bias or T5 relative positional bias, not both" + assert not (alibi_pos_bias and rel_pos_bias), ( + "you can only choose Alibi positional bias or T5 relative positional bias, not both" + ) if alibi_pos_bias: alibi_num_heads = default(alibi_num_heads, heads) @@ -922,9 +922,9 @@ def forward( past_key_values=None, expected_seq_len=None, ): - assert not ( - self.cross_attend ^ (exists(context) or exists(full_context)) - ), "context must be passed in if cross_attend is set to True" + assert not (self.cross_attend ^ (exists(context) or exists(full_context))), ( + "context must be passed in if cross_attend is set to True" + ) assert context is None or full_context is None, "only one of full_context or context can be provided" hiddens = [] @@ -940,9 +940,9 @@ def forward( rotary_pos_emb = None if exists(self.rotary_pos_emb): if not self.training and self.causal: - assert ( - expected_seq_len is not None - ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + assert expected_seq_len is not None, ( + "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + ) elif expected_seq_len is None: expected_seq_len = 0 seq_len = x.shape[1] diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py index 3cac1b8d6d..da5deea9ef 100644 --- a/TTS/tts/layers/vits/transforms.py +++ b/TTS/tts/layers/vits/transforms.py @@ -74,7 +74,7 @@ def unconstrained_rational_quadratic_spline( outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError("{} tails are not implemented.".format(tails)) + raise RuntimeError(f"{tails} tails are not implemented.") outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( inputs=inputs[inside_interval_mask], diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index 20eff26ecc..4e0f53616d 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -347,12 +347,12 @@ def forward( audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1])) # 💖 Lovely assertions - assert ( - max_mel_len <= audio_codes.shape[-1] - ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" - assert ( - max_text_len <= text_inputs.shape[-1] - ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + assert max_mel_len <= audio_codes.shape[-1], ( + f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" + ) + assert max_text_len <= text_inputs.shape[-1], ( + f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + ) # Append stop token to text inputs text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token) @@ -454,9 +454,9 @@ def forward( mel_targets[idx, l + 1 :] = -1 # check if stoptoken is in every row of mel_targets - assert 
(mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[ - 0 - ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], ( + f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + ) # ignore the loss for the segment used for conditioning # coin flip for the segment to be ignored diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 2e6ac01a87..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -97,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..e09a5233ac 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings -from typing import Callable, Optional, Union +from collections.abc import Callable import numpy as np import torch @@ -45,18 +45,18 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[StreamGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, - synced_gpus: Optional[bool] = False, + inputs: torch.Tensor | None = None, + generation_config: StreamGenerationConfig | None = None, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]] | None = None, + synced_gpus: bool | None = False, seed: int = 0, **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: + ) -> GenerateOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head. 
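Several decorators in this diff (the HiFi-GAN decoder's `inference`, `generate` and `sample_stream` above, and the various `inference()` methods earlier) move from `@torch.no_grad()` to `@torch.inference_mode()`. Both disable gradient tracking; `inference_mode` additionally skips autograd's version-counter and view-tracking bookkeeping, with the caveat that tensors created inside can never later participate in a graph that requires grad, which is fine for these inference-only paths. A minimal comparison:

```python
import torch

lin = torch.nn.Linear(4, 4)
x = torch.randn(2, 4)

with torch.no_grad():
    y1 = lin(x)  # gradients disabled

with torch.inference_mode():
    y2 = lin(x)  # gradients disabled, plus lighter autograd bookkeeping

print(y1.requires_grad, y2.requires_grad)  # False False
print(y2.is_inference())                   # True: cannot be reused in autograd later
```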
@@ -662,23 +662,23 @@ def typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, list[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_warper: LogitsProcessorList | None = None, + max_length: int | None = None, + pad_token_id: int | None = None, + eos_token_id: int | list[int] | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + output_scores: bool | None = None, + return_dict_in_generate: bool | None = None, + synced_gpus: bool | None = False, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> SampleOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -953,7 +953,6 @@ def init_stream_support(): def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: - warpers = LogitsProcessorList() if generation_config.temperature is not None and generation_config.temperature != 1.0: diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index fec8358deb..ef4162a1cb 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -76,7 +76,7 @@ def split_sentence(text, lang, text_split_length=250): # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), @@ -99,7 +99,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "señora"), ("sr", "señor"), @@ -112,7 +112,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "fr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mme", "madame"), ("mr", "monsieur"), @@ -124,7 +124,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "de": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("fr", "frau"), ("dr", "doktor"), @@ -134,7 +134,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "senhora"), ("sr", "senhor"), @@ -147,7 +147,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "it": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # ("sig.ra", "signora"), ("sig", "signore"), @@ -159,7 +159,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("p", "pani"), ("m", "pan"), @@ -169,19 +169,19 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # There are not many common abbreviations in Arabic as in English. ] ], "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "cs": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("ing", "inženýr"), # engineer @@ -190,7 +190,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\b", re.IGNORECASE), x[1]) for x in [ ("г-жа", "госпожа"), # Mrs. ("г-н", "господин"), # Mr. @@ -199,7 +199,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. @@ -209,7 +209,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("b", "bay"), # Mr. ("byk", "büyük"), # büyük @@ -218,7 +218,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "hu": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("b", "bácsi"), # Mr. @@ -227,13 +227,13 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. 
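The abbreviation tables above (and the symbol tables that follow) only swap `%`-formatting for f-strings when building the regexes; the compiled patterns are unchanged. A quick check with the English `("mrs", "misess")` entry:

```python
import re

old = re.compile("\\b%s\\." % "mrs", re.IGNORECASE)
new = re.compile(f"\\b{'mrs'}\\.", re.IGNORECASE)

assert old.pattern == new.pattern == "\\bmrs\\."
print(new.sub("misess", "Mrs. Smith"))  # misess Smith
```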
] @@ -249,7 +249,7 @@ def expand_abbreviations_multilingual(text, lang="en"): _symbols_multilingual = { "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " and "), ("@", " at "), @@ -261,7 +261,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " y "), ("@", " arroba "), @@ -273,7 +273,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " et "), ("@", " arobase "), @@ -285,7 +285,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " und "), ("@", " at "), @@ -297,7 +297,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " arroba "), @@ -309,7 +309,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " chiocciola "), @@ -321,7 +321,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " i "), ("@", " małpa "), @@ -334,7 +334,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ar": [ # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " و "), ("@", " على "), @@ -347,7 +347,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "zh": [ # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 和 "), ("@", " 在 "), @@ -360,7 +360,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "cs": [ # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " a "), ("@", " na "), @@ -373,7 +373,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ru": [ # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " и "), ("@", " собака "), @@ -386,7 +386,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "nl": [ # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " en "), ("@", " bij "), @@ -398,7 +398,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ve "), ("@", " at "), @@ -410,7 +410,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ 
("&", " és "), ("@", " kukac "), @@ -423,7 +423,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ko": [ # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 그리고 "), ("@", " 에 "), @@ -435,7 +435,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " और "), ("@", " ऐट दी रेट "), @@ -505,7 +505,7 @@ def _expand_decimal_point(m, lang="en"): def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) + amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))) full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 107054189c..0a8af2f950 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -31,7 +30,7 @@ class GPTTrainerConfig(XttsConfig): optimizer_wd_only_on_weights: bool = False weighted_loss_attrs: dict = field(default_factory=lambda: {}) weighted_loss_multipliers: dict = field(default_factory=lambda: {}) - test_sentences: List[dict] = field(default_factory=lambda: []) + test_sentences: list[dict] = field(default_factory=lambda: []) @dataclass @@ -197,10 +196,6 @@ def __init__(self, config: Coqpit): mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate ) - @property - def device(self): - return next(self.parameters()).device - def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens): """ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode @@ -225,8 +220,8 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: # init gpt for inference mode @@ -241,7 +236,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 s_info["language"], gpt_cond_len=3, )["wav"] - test_audios["{}-audio".format(idx)] = wav + test_audios[f"{idx}-audio"] = wav # delete inference layers del self.xtts.gpt.gpt_inference @@ -249,11 +244,15 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 return {"audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: return batch @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction @@ -335,7 +334,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + 
@torch.inference_mode() def inference( self, x, @@ -355,9 +354,9 @@ def get_sampler(self, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -400,7 +399,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the optimizer based on the config parameters.""" # ToDo: deal with multi GPU training if self.config.optimizer_wd_only_on_weights: @@ -431,7 +430,7 @@ def get_optimizer(self) -> List: v.is_norm = isinstance(m, norm_modules) v.is_emb = isinstance(m, emb_modules) - fpn = "%s.%s" % (mn, k) if mn else k # full param name + fpn = f"{mn}.{k}" if mn else k # full param name all_param_names.add(fpn) param_map[fpn] = v if v.is_bias or v.is_norm or v.is_emb: @@ -464,7 +463,7 @@ def get_optimizer(self) -> List: parameters=self.xtts.gpt.parameters(), ) - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the scheduler for the optimizer. Args: @@ -495,7 +494,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GPTTrainerConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index 69b8dae952..360d9b06c8 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -392,7 +392,7 @@ # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(object): +class ChineseChar: """ 中文字符 每个字符对应简体和繁体, @@ -420,13 +420,13 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super(ChineseNumberUnit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t def __str__(self): - return "10^{}".format(self.power) + return f"10^{self.power}" @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): @@ -447,7 +447,7 @@ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=Fals power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] ) else: - raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) + raise ValueError(f"Counting type should be in {NUMBERING_TYPES} ({numbering_type} provided).") class ChineseNumberDigit(ChineseChar): @@ -456,7 +456,7 @@ class ChineseNumberDigit(ChineseChar): """ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): - super(ChineseNumberDigit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -477,7 +477,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - super(ChineseMath, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -487,13 +487,13 @@ def 
__init__(self, simplified, traditional, symbol, expression=None): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(object): +class NumberSystem: """ 中文数字系统 """ -class MathSymbol(object): +class MathSymbol: """ 用于中文数字系统的数学符号 (繁/简体), e.g. positive = ['正', '正'] @@ -507,8 +507,7 @@ def __init__(self, positive, negative, point): self.point = point def __iter__(self): - for v in self.__dict__.values(): - yield v + yield from self.__dict__.values() # class OtherSymbol(object): @@ -640,7 +639,7 @@ def compute_value(integer_symbols): int_str = str(compute_value(int_part)) dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return "{0}.{1}".format(int_str, dec_str) + return f"{int_str}.{dec_str}" else: return int_str @@ -686,7 +685,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError(f"invalid input num string with more than one dot: {number_string}") if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -702,7 +701,7 @@ def get_value(value_string, use_zeros=True): if isinstance(v, CND) and v.value == 2: next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None - if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, CNU | type(None)): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): result_symbols[i] = liang @@ -1166,7 +1165,7 @@ def __call__(self, text): ) ndone = 0 - with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + with open(args.ifile, encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: if args.format == "tsv": reader = csv.DictReader(istream, delimiter="\t") assert "TEXT" in reader.fieldnames diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index ebfa171c80..4746b13ea2 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,11 @@ import logging -from typing import Dict, List, Union from TTS.utils.generic_utils import find_module logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": +def setup_model(config: "Coqpit", samples: list[list] | list[dict] = None) -> "BaseTTS": logger.info("Using model: %s", config.model) # fetch the right model implementation. 
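The `isinstance(previous_symbol, CNU | type(None))` form above, like the earlier `isinstance(shape, tuple | list)` checks in `diffusion.py`, relies on `isinstance()` accepting PEP 604 unions, which is available from Python 3.10 onward and therefore consistent with this PR dropping Python 3.9. A toy check:

```python
# isinstance() accepts X | Y unions on Python 3.10+.
shape = (1, 80, 400)
assert isinstance(shape, tuple | list)
assert isinstance(None, int | type(None))
print("union isinstance checks pass")
```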
if "base_model" in config and config["base_model"] is not None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 28a52bc558..c2e29c7100 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -233,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward( - self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None - ): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: - x: :math:`[B, T_max]` @@ -288,7 +285,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: @@ -352,9 +349,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -367,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -403,7 +396,7 @@ def on_epoch_start(self, trainer): self.phase = self._set_phase(trainer.config, trainer.total_steps_done) @staticmethod - def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "AlignTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index c52c541b25..84814745a2 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass -from typing import Optional +from pathlib import Path import numpy as np from coqpit import Coqpit @@ -42,10 +42,6 @@ def __init__( self.encodec = EncodecModel.encodec_model_24khz() self.encodec.set_target_bandwidth(6.0) - @property - def device(self): - return next(self.parameters()).device - def load_bark_models(self): self.semantic_model, self.config = load_model( ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" @@ -68,7 +64,7 @@ def train_step( def text_to_semantic( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, allow_early_stop=True, @@ -98,7 +94,7 @@ def text_to_semantic( def 
semantic_to_waveform( self, semantic_tokens: np.ndarray, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, ): @@ -132,7 +128,7 @@ def semantic_to_waveform( def generate_audio( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, text_temp: float = 0.7, waveform_temp: float = 0.7, base=None, @@ -194,9 +190,7 @@ def _set_voice_dirs(self, voice_dirs): return _voice_dirs # TODO: remove config from synthesize - def synthesize( - self, text, config, speaker_id="random", voice_dirs=None, **kwargs - ): # pylint: disable=unused-argument + def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs): # pylint: disable=unused-argument """Synthesize speech with the given input text. Args: @@ -269,10 +263,12 @@ def load_checkpoint( fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") + # The paths in the default config start with /root/.local/share/tts and need to be fixed self.config.LOCAL_MODEL_PATHS["text"] = text_model_path self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path + self.config.CACHE_DIR = str(Path(text_model_path).parent) self.load_bark_models() diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 79cdf1a7d4..05f4ae168d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,7 +1,6 @@ import copy import logging from abc import abstractmethod -from typing import Dict, Tuple import torch from coqpit import Coqpit @@ -62,7 +61,7 @@ def __init__( self.coarse_decoder = None @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: + def _format_aux_input(aux_input: dict) -> dict: """Set missing fields to their default values""" if aux_input: return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) @@ -94,9 +93,7 @@ def forward(self): def inference(self): pass - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load model checkpoint and set up internals. Args: @@ -141,7 +138,7 @@ def init_from_config(config: Coqpit): # TEST AND LOG FUNCTIONS # ########################## - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
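On the Bark `load_checkpoint` change above: besides pointing `LOCAL_MODEL_PATHS` at the supplied checkpoint files, it re-derives `CACHE_DIR` from wherever the text checkpoint actually lives instead of the hard-coded `/root/.local/share/tts` default. A sketch with an illustrative path (not the real model directory name):

```python
from pathlib import Path

# Illustrative path only; in the diff this comes from text_model_path.
text_model_path = "/home/user/.local/share/tts/bark/text_2.pt"
cache_dir = str(Path(text_model_path).parent)
print(cache_dir)  # /home/user/.local/share/tts/bark
```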
@@ -169,17 +166,19 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 91dd6e96d6..6a78cf603f 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,7 +1,6 @@ import logging import os import random -from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -107,7 +106,7 @@ def adjust_speech_rate(self, gpt_latents, length_scale): print(f"Interpolation failed: {e}") return gpt_latents - def init_multispeaker(self, config: Coqpit, data: List = None): + def init_multispeaker(self, config: Coqpit, data: list = None): """Set up for multi-speaker TTS. Initialize a speaker embedding layer if needed and define expected embedding @@ -142,7 +141,7 @@ def init_multispeaker(self, config: Coqpit, data: List = None): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs) -> dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} @@ -193,7 +192,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Generic batch formatting for `TTSDataset`. You must override this if you use a custom dataset. @@ -239,9 +238,9 @@ def format_batch(self, batch: Dict) -> Dict: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -313,9 +312,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -394,7 +393,7 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict: d_vector = None if self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] @@ -411,7 +410,7 @@ def _get_test_aux_input( } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -442,13 +441,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index e6db116081..7b6103512c 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -3,7 +3,6 @@ from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -65,7 +64,7 @@ class ForwardTTSE2eF0Dataset(F0Dataset): def __init__( self, ap, - samples: Union[List[List], List[Dict]], + samples: list[list] | list[dict], cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -275,15 +274,15 @@ def collate_fn(self, batch): @dataclass class VocoderConfig(Coqpit): resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) use_spectral_norm_discriminator: bool = False - upsampling_rates_discriminator: List[int] = field(default_factory=lambda: [4, 4, 4, 4]) - periods_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) - pretrained_model_path: Optional[str] = None + 
upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4]) + periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + pretrained_model_path: str | None = None @dataclass @@ -438,10 +437,6 @@ def __init__( periods=self.config.vocoder.periods_discriminator, ) - @property - def device(self): - return next(self.parameters()).device - @property def energy_scaler(self): return self.acoustic_model.energy_scaler @@ -557,7 +552,7 @@ def forward( attn_priors: torch.FloatTensor = None, d_vectors: torch.FloatTensor = None, speaker_idx: torch.LongTensor = None, - ) -> Dict: + ) -> dict: """Model's forward pass. Args: @@ -622,7 +617,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -646,7 +641,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -836,9 +831,7 @@ def _log(self, batch, outputs, name_prefix="train"): audios[f"{name_prefix}/vocoder_audio"] = sample_voice return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use, unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use, unused-argument """Create visualizations and waveform examples. For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1018,8 +1011,8 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
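The decorator swaps above (and in the models that follow) replace `torch.no_grad()` with `torch.inference_mode()` on pure inference paths. `inference_mode()` disables gradient tracking just like `no_grad()` and additionally skips autograd's view and version-counter bookkeeping, so it is usually the cheaper choice; the trade-off is that tensors created inside it cannot participate in autograd later. A small runnable sketch, with the module and function invented for illustration:

    # Illustrative only: inference_mode() as a drop-in for no_grad() on
    # code paths that never need gradients afterwards.
    import torch

    model = torch.nn.Linear(4, 2)

    @torch.inference_mode()
    def predict(x: torch.Tensor) -> torch.Tensor:
        return model(x)

    y = predict(torch.randn(1, 4))  # y.requires_grad is False; y is an inference tensor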
@@ -1045,18 +1038,22 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: d_vector=aux_inputs["d_vector"], ) # speaker_name = self.speaker_manager.speaker_names[aux_inputs["speaker_id"]] - test_audios["{}-audio".format(idx)] = outputs["wav"].T - test_audios["{}-audio_encoder".format(idx)] = outputs_gl["wav"].T - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_audios[f"{idx}-audio"] = outputs["wav"].T + test_audios[f"{idx}-audio_encoder"] = outputs_gl["wav"].T + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None d_vectors = None @@ -1164,9 +1161,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1221,7 +1218,7 @@ def get_data_loader( def get_criterion(self): return [VitsDiscriminatorLoss(self.config), DelightfulTTSLoss(self.config)] - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. Returns: @@ -1236,7 +1233,7 @@ def get_optimizer(self) -> List: ) return [optimizer_disc, optimizer_gen] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1244,7 +1241,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1263,9 +1260,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument self.energy_scaler.eval() @staticmethod - def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None - ): # pylint: disable=unused-argument + def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None): # pylint: disable=unused-argument """Initiate model from config Args: diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d09e3ea91b..497ac3f63a 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -333,7 +332,7 @@ def format_durations(self, o_dr_log, x_mask): def _forward_encoder( self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Encoding forward pass. 1. Embed speaker IDs if multi-speaker mode. @@ -381,7 +380,7 @@ def _forward_decoder( x_mask: torch.FloatTensor, y_lengths: torch.IntTensor, g: torch.FloatTensor, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Decoding forward pass. 1. Compute the decoder output mask @@ -415,7 +414,7 @@ def _forward_pitch_predictor( x_mask: torch.IntTensor, pitch: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Pitch predictor forward pass. 1. Predict pitch from encoder outputs. @@ -451,7 +450,7 @@ def _forward_energy_predictor( x_mask: torch.IntTensor, energy: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Energy predictor forward pass. 1. Predict energy from encoder outputs. @@ -483,7 +482,7 @@ def _forward_energy_predictor( def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -522,7 +521,7 @@ def _forward_aligner( alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -544,8 +543,8 @@ def forward( dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, energy: torch.FloatTensor = None, - aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument - ) -> Dict: + aux_input: dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument + ) -> dict: """Model's forward pass. Args: @@ -628,7 +627,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. 
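The assert rewrites scattered through this diff (here and in glow_tts, vits, and xtts below) match the ruff formatter's style: the condition stays on the `assert` line and only the message is parenthesized and wrapped, alongside the `str.format()` to f-string conversions. For example, with invented values:

    # Illustrative only: wrapped-message assert style plus an f-string key.
    def check_lengths(total: int, expected: int, idx: int = 0) -> str:
        assert total == expected, (
            f" [!] total duration {total} vs spectrogram length {expected}"
        )
        return f"{idx}-audio"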
@@ -771,9 +770,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -786,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -805,7 +800,7 @@ def on_train_step_start(self, trainer): self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod - def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "ForwardTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..5d03b53dc6 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,6 +1,5 @@ import logging import math -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -125,9 +124,9 @@ def init_multispeaker(self, config: Coqpit): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) if self.speaker_manager is not None: - assert ( - config.d_vector_dim == self.speaker_manager.embedding_dim - ), " [!] d-vector dimension mismatch b/w config and speaker manager." + assert config.d_vector_dim == self.speaker_manager.embedding_dim, ( + " [!] d-vector dimension mismatch b/w config and speaker manager." 
+ ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: logger.info("Init speaker_embedding layer.") @@ -162,7 +161,7 @@ def lock_act_norm_layers(self): if getattr(f, "set_ddi", False): f.set_ddi(False) - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): if aux_input is None: d_vectors = None speaker_ids = None @@ -179,7 +178,7 @@ def _set_speaker_input(self, aux_input: Dict): g = speaker_ids if speaker_ids is not None else d_vectors return g - def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: + def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = self._set_speaker_input(aux_input) # speaker embedding if g is not None: @@ -193,9 +192,7 @@ def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: g = F.normalize(g).unsqueeze(-1) # [b, h, 1] return g - def forward( - self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Args: x (torch.Tensor): @@ -262,7 +259,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,10 +315,8 @@ def inference_with_MAS( } return outputs - @torch.no_grad() - def decoder_inference( - self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Shapes: - y: :math:`[B, T, C]` @@ -341,10 +336,8 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() - def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = self._speaker_embedding(aux_input) # embedding pass @@ -457,14 +450,12 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,8 +464,8 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` 
models used by `Trainer`. You can override this for a different behaviour. @@ -503,11 +494,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios def preprocess(self, y, y_lengths, y_max_length, attn=None): @@ -522,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: @@ -543,7 +532,7 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index 0b3fadafbf..2cbf425884 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -102,7 +101,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -174,10 +173,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -195,7 +194,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -239,7 +238,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "NeuralhmmTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -346,17 +345,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -370,7 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 1c146b2eac..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -116,7 +115,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -188,10 +187,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -209,7 +208,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -255,7 +254,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "OverFlowConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -363,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -387,7 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..59173691f7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Tuple, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -218,7 +214,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) @@ -280,7 +276,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: torch.nn.Module) -> tuple[dict, dict]: """Perform a single training step by fetching the right set of samples from the batch. 
Args: @@ -332,7 +328,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -380,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -396,7 +390,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "TacotronConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -238,7 +234,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. @@ -309,7 +305,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module): + def train_step(self, batch: dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. 
Args: @@ -360,7 +356,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -403,9 +399,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) @@ -420,7 +414,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "Tacotron2Config", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 738e9dd9b3..a42d577676 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -342,7 +342,6 @@ def __init__(self, config: Coqpit): else self.args.autoregressive_batch_size ) self.enable_redaction = self.args.enable_redaction - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.enable_redaction: self.aligner = Wav2VecAlignment() @@ -685,9 +684,9 @@ def inference( text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. - assert ( - text_tokens.shape[-1] < 400 - ), "Too much text provided. Break the text up into separate segments and re-try inference." + assert text_tokens.shape[-1] < 400, ( + "Too much text provided. Break the text up into separate segments and re-try inference." 
+ ) if voice_samples is not None: ( diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7ec2519236..3b6cee7ead 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -3,7 +3,8 @@ import os from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple, Union +from pathlib import Path +from typing import Any import numpy as np import torch @@ -400,12 +401,12 @@ class VitsArgs(Coqpit): dilation_rate_flow: int = 1 num_layers_flow: int = 4 resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + periods_multi_period_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) use_sdp: bool = True noise_scale: float = 1.0 inference_noise_scale: float = 0.667 @@ -418,7 +419,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: List[str] = None + d_vector_file: list[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 @@ -565,10 +566,6 @@ def __init__( use_spectral_norm=self.args.use_spectral_norm_disriminator, ) - @property - def device(self): - return next(self.parameters()).device - def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -683,7 +680,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 raise RuntimeError(" [!] 
The weights of Text Encoder was not reinit check it !") logger.info("Text Encoder was reinit.") - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} @@ -713,7 +710,7 @@ def _freeze_layers(self): param.requires_grad = False @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -735,7 +732,7 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -808,7 +805,7 @@ def forward( # pylint: disable=dangerous-default-value y_lengths: torch.tensor, waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - ) -> Dict: + ) -> dict: """Forward pass of the model. Args: @@ -927,7 +924,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1014,7 +1011,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1055,8 +1052,8 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) - g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy(np.array(speaker_cond_src)).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy(np.array(speaker_cond_tgt)).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) @@ -1069,7 +1066,7 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) return o_hat, y_mask, (z, z_p, z_hat) - def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Perform a single training step. Run the model forward pass and compute losses. Args: @@ -1189,9 +1186,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unus ) return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use """Create visualizations and waveform examples. 
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1209,7 +1204,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1266,8 +1261,8 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -1293,17 +1288,21 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios[f"{idx}-audio"] = wav + test_figures[f"{idx}-alignment"] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None language_ids = None @@ -1367,9 +1366,9 @@ def format_batch_on_device(self, batch): ) if self.args.encoder_sample_rate: - assert batch["spec"].shape[2] == int( - batch["mel"].shape[2] / self.interpolate_factor - ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), ( + f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + ) else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" @@ -1426,9 +1425,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1490,7 +1489,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returns 2 optimizers in a list. First one is for the discriminator @@ -1508,7 +1507,7 @@ def get_optimizer(self) -> List: ) return [optimizer0, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1516,7 +1515,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1539,9 +1538,7 @@ def get_criterion(self): return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] - def load_checkpoint( - self, config, checkpoint_path, eval=False, strict=True, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) # compat band-aid for the pre-trained models to not use the encoder baked into the model @@ -1568,9 +1565,7 @@ def load_checkpoint( self.eval() assert not self.training - def load_fairseq_checkpoint( - self, config, checkpoint_dir, eval=False, strict=True - ): # pylint: disable=unused-argument, redefined-builtin + def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True): # pylint: disable=unused-argument, redefined-builtin """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms Performs some changes for compatibility. @@ -1585,13 +1580,16 @@ def load_fairseq_checkpoint( self.disc = None # set paths - config_file = os.path.join(checkpoint_dir, "config.json") - checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth") - vocab_file = os.path.join(checkpoint_dir, "vocab.txt") + checkpoint_dir = Path(checkpoint_dir) + config_file = checkpoint_dir / "config.json" + checkpoint_file = checkpoint_dir / "model.pth" + if not checkpoint_file.is_file(): + checkpoint_file = checkpoint_dir / "G_100000.pth" + vocab_file = checkpoint_dir / "vocab.txt" # set config params - with open(config_file, "r", encoding="utf-8") as file: + with open(config_file, encoding="utf-8") as f: # Load the JSON data as a dictionary - config_org = json.load(file) + config_org = json.load(f) self.config.audio.sample_rate = config_org["data"]["sampling_rate"] # self.config.add_blank = config['add_blank'] # set tokenizer @@ -1613,7 +1611,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -1626,15 +1624,15 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() if not config.model_args.encoder_sample_rate: - assert ( - upsample_rate == config.audio.hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + assert upsample_rate == config.audio.hop_length, ( + f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + ) else: encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor - assert ( - upsample_rate == effective_hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + assert upsample_rate == effective_hop_length, ( + f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + ) ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) @@ -1825,7 +1823,7 @@ def to_config(self) -> "CharactersConfig": class FairseqVocab(BaseVocabulary): - def __init__(self, vocab: str): + def __init__(self, vocab: str | os.PathLike[Any]): super(FairseqVocab).__init__() self.vocab = vocab @@ -1835,7 +1833,7 @@ def vocab(self): return self._vocab @vocab.setter - def vocab(self, vocab_file): + def vocab(self, vocab_file: str | os.PathLike[Any]): with open(vocab_file, encoding="utf-8") as f: self._vocab = [x.replace("\n", "") for x in f.readlines()] self.blank = self._vocab[0] diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 38091d7cff..833e2ddaa2 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -2,7 +2,6 @@ import os from dataclasses import dataclass from pathlib import Path -from typing import Optional import librosa import torch @@ -239,10 +238,6 @@ def init_models(self): cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, ) - @property - def device(self): - return next(self.parameters()).device - @torch.inference_mode() def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6): """Compute the conditioning latents for the GPT model from the given audio. @@ -384,9 +379,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, speed as latents used at inference. """ - assert ( - "zh-cn" if language == "zh" else language in self.config.languages - ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + assert "zh-cn" if language == "zh" else language in self.config.languages, ( + f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + ) # Use generally found best tuning knobs for generation. settings = { "temperature": config.temperature, @@ -526,9 +521,9 @@ def inference( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) with torch.no_grad(): gpt_codes = self.gpt.generate( @@ -631,9 +626,9 @@ def inference_stream( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." 
+ ) fake_inputs = self.gpt.compute_embeddings( gpt_cond_latent.to(self.device), @@ -722,13 +717,13 @@ def get_compatible_checkpoint_state_dict(self, model_path): def load_checkpoint( self, config: "XttsConfig", - checkpoint_dir: Optional[str] = None, - checkpoint_path: Optional[str] = None, - vocab_path: Optional[str] = None, + checkpoint_dir: str | None = None, + checkpoint_path: str | None = None, + vocab_path: str | None = None, eval: bool = True, strict: bool = True, use_deepspeed: bool = False, - speaker_file_path: Optional[str] = None, + speaker_file_path: str | None = None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 22e46b683a..d0269060c8 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -11,7 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) + max_len = max(len(x) for x in inputs) return np.stack([_pad_data(x, max_len) for x in inputs]) @@ -23,7 +23,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + max_len = max(x.shape[1] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -46,7 +46,7 @@ def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: def prepare_stop_target(inputs, out_steps): """Pad row vectors with 1.""" - max_len = max((x.shape[0] for x in inputs)) + max_len = max(x.shape[0] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index ff10f751f2..a3648eff4b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np import torch from scipy.stats import betabinom @@ -35,7 +33,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length: torch.Tensor, max_len: Optional[int] = None) -> torch.Tensor: +def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. Args: @@ -107,9 +105,9 @@ def rand_segments( _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size else: - assert all( - len_diff > 0 - ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all(len_diff > 0), ( + f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + ) segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices @@ -164,7 +162,7 @@ def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: def generate_attention( - duration: torch.Tensor, x_mask: torch.Tensor, y_mask: Optional[torch.Tensor] = None + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: torch.Tensor | None = None ) -> torch.Tensor: """Generate an attention map from the linear scale durations. 
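In `prepare_data`, `prepare_tensor`, and `prepare_stop_target` above, the extra parentheses around the generator expressions are dropped: a generator expression that is the sole argument of a call needs no parentheses of its own. For example, with invented values:

    # Illustrative only: max() over a bare generator expression.
    xs = [[1, 2, 3], [4], [5, 6]]
    max_len = max(len(x) for x in xs)  # 3, same result as max((len(x) for x in xs))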
diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index c72de2d4e6..5ce7759dd8 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: Union[str, os.PathLike[Any]] = "", - config: Optional[Coqpit] = None, + language_ids_file_path: str | os.PathLike[Any] = "", + config: Coqpit | None = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -40,11 +40,11 @@ def num_languages(self) -> int: return len(list(self.name_to_id.keys())) @property - def language_names(self) -> List: + def language_names(self) -> list: return list(self.name_to_id.keys()) @staticmethod - def parse_language_ids_from_config(c: Coqpit) -> Dict: + def parse_language_ids_from_config(c: Coqpit) -> dict: """Set language id from config. Args: @@ -70,13 +70,13 @@ def set_language_ids_from_config(self, c: Coqpit) -> None: self.name_to_id = self.parse_language_ids_from_config(c) @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Any: + def parse_ids_from_data(items: list, parse_key: str) -> Any: raise NotImplementedError - def set_ids_from_data(self, items: List, parse_key: str) -> Any: + def set_ids_from_data(self, items: list, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 3a715dd75d..49e93454f2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any import fsspec import numpy as np @@ -13,7 +13,7 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: Union[str, os.PathLike[Any]]): +def load_file(path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: @@ -25,7 +25,7 @@ def load_file(path: Union[str, os.PathLike[Any]]): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: Union[str, os.PathLike[Any]]): +def save_file(obj: Any, path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: @@ -42,23 +42,23 @@ class BaseIDManager: It defines common `ID` manager specific functions. """ - def __init__(self, id_file_path: Union[str, os.PathLike[Any]] = ""): + def __init__(self, id_file_path: str | os.PathLike[Any] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: Union[str, os.PathLike[Any]]) -> Dict: + def _load_json(json_file_path: str | os.PathLike[Any]) -> dict: with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: Union[str, os.PathLike[Any]], data: dict) -> None: + def _save_json(json_file_path: str | os.PathLike[Any], data: dict) -> None: with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) - def set_ids_from_data(self, items: List, parse_key: str) -> None: + def set_ids_from_data(self, items: list, parse_key: str) -> None: """Set IDs from data samples. 
Args: @@ -66,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_ids_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Set IDs from a file. Args: @@ -74,7 +74,7 @@ def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save IDs to a json file. Args: @@ -96,7 +96,7 @@ def get_random_id(self) -> Any: return None @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + def parse_ids_from_data(items: list, parse_key: str) -> tuple[dict]: """Parse IDs from data samples retured by `load_tts_samples()`. Args: @@ -133,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[Union[str, os.PathLike[Any]], list[Union[str, os.PathLike[Any]]]] = "", - id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + embedding_file_path: str | os.PathLike[Any] | list[str | os.PathLike[Any]] = "", + id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -179,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_embeddings_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save embeddings to a json file. Args: @@ -188,7 +188,7 @@ def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> No save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): + def read_embeddings_from_file(file_path: str | os.PathLike[Any]): """Load embeddings from a json file. Args: @@ -207,7 +207,7 @@ def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_embeddings_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Load embeddings from a json file. Args: @@ -217,7 +217,7 @@ def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> file_path ) - def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.PathLike[Any]]]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[str | os.PathLike[Any]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -242,7 +242,7 @@ def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.Path # reset name_to_id to get the right speaker ids self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} - def get_embedding_by_clip(self, clip_idx: str) -> List: + def get_embedding_by_clip(self, clip_idx: str) -> list: """Get embedding by clip ID. 
Args: @@ -253,7 +253,7 @@ def get_embedding_by_clip(self, clip_idx: str) -> List: """ return self.embeddings[clip_idx]["embedding"] - def get_embeddings_by_name(self, idx: str) -> List[List]: + def get_embeddings_by_name(self, idx: str) -> list[list]: """Get all embeddings of a speaker. Args: @@ -264,7 +264,7 @@ def get_embeddings_by_name(self, idx: str) -> List[List]: """ return self.embeddings_by_names[idx] - def get_embeddings_by_names(self) -> Dict: + def get_embeddings_by_names(self) -> dict: """Get all embeddings by names. Returns: @@ -313,11 +313,11 @@ def get_random_embedding(self) -> Any: return None - def get_clips(self) -> List: + def get_clips(self) -> list: return sorted(self.embeddings.keys()) def init_encoder( - self, model_path: Union[str, os.PathLike[Any]], config_path: Union[str, os.PathLike[Any]], use_cuda=False + self, model_path: str | os.PathLike[Any], config_path: str | os.PathLike[Any], use_cuda=False ) -> None: """Initialize a speaker encoder model. @@ -334,9 +334,8 @@ def init_encoder( ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) - def compute_embedding_from_clip( - self, wav_file: Union[Union[str, os.PathLike[Any]], List[Union[str, os.PathLike[Any]]]] - ) -> list: + @torch.inference_mode() + def compute_embedding_from_clip(self, wav_file: str | os.PathLike[Any] | list[str | os.PathLike[Any]]) -> list: """Compute a embedding from a given audio file. Args: @@ -373,7 +372,7 @@ def _compute(wav_file: str): embedding = _compute(wav_file) return embedding[0].tolist() - def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + def compute_embeddings(self, feats: torch.Tensor | np.ndarray) -> list: """Compute embedding from features. Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 89c56583f5..6fab27de5a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Optional, Union +from typing import Any import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: Optional[list[list[Any]]] = None, + data_items: list[list[Any]] | None = None, d_vectors_file_path: str = "", - speaker_id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + speaker_id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__( @@ -82,11 +82,11 @@ def num_speakers(self): def speaker_names(self): return list(self.name_to_id.keys()) - def get_speakers(self) -> List: + def get_speakers(self) -> list: return self.name_to_id @staticmethod - def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": + def init_from_config(config: "Coqpit", samples: list[list] | list[dict] = None) -> "SpeakerManager": """Initialize a speaker manager from config Args: @@ -150,7 +150,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: +def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: """Initiate a `SpeakerManager` instance by the provided config. 
Args: @@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.name_to_id speaker_manager.load_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." + assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), ( + " [!] You cannot introduce new speakers to a pre-trained model." + ) elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. speaker_manager.load_embeddings_from_file(c.d_vector_file) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index eddf05db3f..660370a832 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,6 +1,5 @@ # Adopted from https://github.com/photosynthesis-team/piq -from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -24,11 +23,11 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: def _validate_input( - tensors: List[torch.Tensor], - dim_range: Tuple[int, int] = (0, -1), - data_range: Tuple[float, float] = (0.0, -1.0), + tensors: list[torch.Tensor], + dim_range: tuple[int, int] = (0, -1), + data_range: tuple[float, float] = (0.0, -1.0), # size_dim_range: Tuple[float, float] = (0., -1.), - size_range: Optional[Tuple[int, int]] = None, + size_range: tuple[int, int] | None = None, ) -> None: r"""Check that input(-s) satisfies the requirements Args: @@ -50,16 +49,16 @@ def _validate_input( if size_range is None: assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - assert ( - t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] - ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], ( + f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + ) if dim_range[0] == dim_range[1]: assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" elif dim_range[0] < dim_range[1]: - assert ( - dim_range[0] <= t.dim() <= dim_range[1] - ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + assert dim_range[0] <= t.dim() <= dim_range[1], ( + f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + ) if data_range[0] < data_range[1]: assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" @@ -89,13 +88,13 @@ def ssim( y: torch.Tensor, kernel_size: int = 11, kernel_sigma: float = 1.5, - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, reduction: str = "mean", full: bool = False, downsample: bool = True, k1: float = 0.01, k2: float = 0.03, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: r"""Interface of Structural Similarity (SSIM) index. Inputs supposed to be in range ``[0, data_range]``. To match performance with skimage and tensorflow set ``'downsample' = True``. 
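
As a side note on the assertion rewrites in the hunks above (and repeated throughout this patch): the new layout keeps the condition on one line and wraps the long message in parentheses, where the old layout wrapped the condition instead. A small standalone sketch of the two layouts, using made-up values, not code from the repository:

```python
# Illustrative only: the two assert layouts seen in this patch, shown on a
# standalone example with hypothetical values.
fft_size, win_length = 2048, 1024

# Old layout (removed by this patch): the *condition* is wrapped.
assert (
    win_length <= fft_size
), f" [!] win_length cannot be larger than fft_size - {win_length} vs {fft_size}"

# New layout (introduced by this patch): the condition stays on one line and
# the *message* is parenthesized instead; runtime behaviour is identical.
assert win_length <= fft_size, (
    f" [!] win_length cannot be larger than fft_size - {win_length} vs {fft_size}"
)
```
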
@@ -218,7 +217,7 @@ def __init__( k2: float = 0.03, downsample: bool = True, reduction: str = "mean", - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, ) -> None: super().__init__() @@ -270,7 +269,7 @@ def _ssim_per_channel( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. Args: @@ -286,8 +285,7 @@ def _ssim_per_channel( """ if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 @@ -321,7 +319,7 @@ def _ssim_per_channel_complex( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. Args: @@ -338,8 +336,7 @@ def _ssim_per_channel_complex( n_channels = x.size(1) if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 5dc4cc569f..c09c3f5aa2 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,13 +1,9 @@ -from typing import Dict, Optional, Union - import numpy as np import torch from torch import nn -def numpy_to_torch( - np_array: np.ndarray, dtype: torch.dtype, device: Union[str, torch.device] = "cpu" -) -> Optional[torch.Tensor]: +def numpy_to_torch(np_array: np.ndarray, dtype: torch.dtype, device: str | torch.device = "cpu") -> torch.Tensor | None: if np_array is None: return None return torch.as_tensor(np_array, dtype=dtype, device=device) @@ -31,7 +27,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, -) -> Dict: +) -> dict: """Run a torch model for inference. It does not support batch inference. 
Args: @@ -75,14 +71,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def id_to_torch(aux_id, device: str | torch.device = "cpu") -> torch.Tensor | None: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def embedding_to_torch(d_vector, device: str | torch.device = "cpu") -> torch.Tensor | None: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index cddcb00fd5..1537240380 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -45,7 +45,7 @@ def tag_text(text: str): # create start and end text = "start" + text + "end" # tag text - parts = re.split("[\u0600-\u06FF]+", text) + parts = re.split("[\u0600-\u06ff]+", text) # remove non chars parts = [p for p in parts if p.strip()] # unique parts diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 4bf9bf6bd5..f8beaef036 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,6 +1,5 @@ import logging from dataclasses import replace -from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig @@ -47,7 +46,7 @@ class BaseVocabulary: vocab (Dict): A dictionary of characters and their corresponding indices. """ - def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): + def __init__(self, vocab: dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): self.vocab = vocab self.pad = pad self.blank = blank @@ -290,9 +289,9 @@ def _create_vocab(self): self.vocab = _vocab + list(self._punctuations) if self.is_unique: duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} - assert ( - len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {duplicates}" + assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), ( + f" [!] There are duplicate characters in the character set. {duplicates}" + ) def char_to_id(self, char: str) -> int: try: diff --git a/TTS/tts/utils/text/chinese_mandarin/numbers.py b/TTS/tts/utils/text/chinese_mandarin/numbers.py index 4787ea6100..3e6a043918 100644 --- a/TTS/tts/utils/text/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/text/chinese_mandarin/numbers.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Licensed under WTFPL or the Unlicense or CC0. 
# This uses Python 3, but it's easy to port to Python 2 by changing diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index e9d62e9d06..4dccdd5778 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,5 +1,3 @@ -from typing import List - try: import jieba import pypinyin @@ -9,7 +7,7 @@ from .pinyinToPhonemes import PINYIN_DICT -def _chinese_character_to_pinyin(text: str) -> List[str]: +def _chinese_character_to_pinyin(text: str) -> list[str]: pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) pinyins_flat_list = [item for sublist in pinyins for item in sublist] return pinyins_flat_list @@ -25,9 +23,9 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) - pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + pinyined_text: list[str] = _chinese_character_to_pinyin(tokenized_text) - results: List[str] = [] + results: list[str] = [] for token in pinyined_text: if token[-1] in "12345": # TODO transform to is_pinyin() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f496b9f0dd..795ab246d2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,6 @@ """Set of default text cleaners""" import re -from typing import Optional from unicodedata import normalize from anyascii import anyascii @@ -47,7 +46,7 @@ def remove_aux_symbols(text: str) -> str: return text -def replace_symbols(text: str, lang: Optional[str] = "en") -> str: +def replace_symbols(text: str, lang: str | None = "en") -> str: """Replace symbols based on the language tag. Args: diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index f206fb043b..9c0df06196 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import re VALID_SYMBOLS = [ @@ -121,7 +119,7 @@ def get_arpabet(word, cmudict, punctuation_symbols): word = word[:-1] arpabet = cmudict.lookup(word) if arpabet is not None: - return first_symbol + "{%s}" % arpabet[0] + last_symbol + return first_symbol + "{%s}" % arpabet[0] + last_symbol # noqa: UP031 return first_symbol + word + last_symbol diff --git a/TTS/tts/utils/text/english/abbreviations.py b/TTS/tts/utils/text/english/abbreviations.py index cd93c13c8e..20042b255b 100644 --- a/TTS/tts/utils/text/english/abbreviations.py +++ b/TTS/tts/utils/text/english/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in english: abbreviations_en = [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index e8377ede87..be2a4b3084 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,7 +1,6 @@ -""" from https://github.com/keithito/tacotron """ +"""from https://github.com/keithito/tacotron""" import re -from typing import Dict import inflect @@ -21,7 +20,7 @@ def _expand_decimal_point(m): return m.group(1).replace(".", " point ") -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: +def __expand_currency(value: str, inflection: dict[float, str]) -> str: parts = value.replace(",", "").split(".") if len(parts) > 2: return f"{value} {inflection[2]}" # Unexpected format @@ -85,7 +84,11 @@ def _expand_number(m): if num % 100 == 0: return _inflect.number_to_words(num // 100) + " hundred" return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") + try: + text = _inflect.number_to_words(num, andword="") + except inflect.NumOutOfRangeError: + text = _inflect.number_to_words(num, group=1).replace(", ", " ") + return text def normalize_numbers(text): diff --git a/TTS/tts/utils/text/french/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py index f580dfed7b..e317bbbf3a 100644 --- a/TTS/tts/utils/text/french/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("M", "monsieur"), ("Mlle", "mademoiselle"), @@ -38,7 +38,7 @@ ("boul", "boulevard"), ] ] + [ - (re.compile("\\b%s" % x[0]), x[1]) + (re.compile(f"\\b{x[0]}"), x[1]) for x in [ ("Mlle", "mademoiselle"), ("Mlles", "mesdemoiselles"), diff --git a/TTS/tts/utils/text/korean/ko_dictionary.py b/TTS/tts/utils/text/korean/ko_dictionary.py index 9b739339c6..706f9f5daf 100644 --- a/TTS/tts/utils/text/korean/ko_dictionary.py +++ b/TTS/tts/utils/text/korean/ko_dictionary.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Add the word you want to the dictionary. 
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 423aeed377..1b1e0ca0fb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py index 3c4a35bbfa..3be7354636 100644 --- a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_bn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"bn": "Bangla"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 5e701df458..6cc6ec0b37 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc import logging -from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -37,7 +36,7 @@ class BasePhonemizer(abc.ABC): def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): # ensure the backend is installed on the system if not self.is_available(): - raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover + raise RuntimeError(f"{self.name()} not installed on your system") # pragma: nocover # ensure the backend support the requested language self._language = self._init_language(language) @@ -53,7 +52,7 @@ def _init_language(self, language): """ if not self.is_supported_language(language): - raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") + raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend') return language @property @@ -93,7 +92,7 @@ def is_supported_language(self, language): def _phonemize(self, text, separator): """The main phonemization method""" - def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: + def _phonemize_preprocess(self, text) -> tuple[list[str], list]: """Preprocess the text before phonemization 1. 
remove spaces diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py index e5fcab6e09..fa4a515d1a 100644 --- a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -34,7 +32,7 @@ def _phonemize(self, text, separator): return self.phonemize_be(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"be": "Belarusian"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index a15df716e7..dbcb8994a7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -5,7 +5,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional from packaging.version import Version @@ -104,7 +103,7 @@ class ESpeak(BasePhonemizer): def __init__( self, language: str, - backend: Optional[str] = None, + backend: str | None = None, punctuations: str = Punctuation.default_puncs(), keep_puncs: bool = True, ): @@ -184,7 +183,7 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False else: args.append("--ipa=1") if tie: - args.append("--tie=%s" % tie) + args.append(f"--tie={tie}") tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") tmp.write(text) diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index f3e9c9abd4..836fccf5b8 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from typing import List import gruut from gruut_ipa import IPA @@ -114,7 +113,7 @@ def is_supported_language(self, language): return gruut.is_language_supported(language) @staticmethod - def supported_languages() -> List: + def supported_languages() -> list: """Get a dictionary of supported languages. 
Returns: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 878e5e5296..b3b3ba4db7 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -51,7 +49,7 @@ def phonemize(self, text: str, separator="|", language=None) -> str: return self._phonemize(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ja-jp": "Japanese (Japan)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index 0bdba2137b..93930d064e 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -44,7 +42,7 @@ def phonemize(self, text: str, separator: str = "", character: str = "hangeul", return self._phonemize(text, separator, character) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ko-kr": "hangeul(korean)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 1a9e98b091..87fb940f6b 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name @@ -19,7 +18,7 @@ class MultiPhonemizer: lang_to_phonemizer = {} - def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + def __init__(self, lang_to_phonemizer_name: dict = {}) -> None: # pylint: disable=dangerous-default-value for k, v in lang_to_phonemizer_name.items(): if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] @@ -29,7 +28,7 @@ def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disab self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod - def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: + def init_phonemizers(lang_to_phonemizer_name: dict) -> dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) @@ -44,7 +43,7 @@ def phonemize(self, text, separator="|", language=""): raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) - def supported_languages(self) -> List: + def supported_languages(self) -> list: return list(self.lang_to_phonemizer.keys()) def print_logs(self, level: int = 0): diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index 41480c4173..9e70b03a0c 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_zh_cn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"zh-cn": "Chinese (China)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f653cdf13f..07a8753884 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,5 +1,6 @@ import logging -from typing import Callable, Dict, List, Union +from collections.abc import Callable +from typing import Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes @@ -43,7 +44,7 @@ def __init__( use_phonemes=False, text_cleaner: Callable = None, characters: "BaseCharacters" = None, - phonemizer: Union["Phonemizer", Dict] = None, + phonemizer: Union["Phonemizer", dict] = None, add_blank: bool = False, use_eos_bos=False, ): @@ -65,7 +66,7 @@ def characters(self, new_characters): self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: """Encodes a string of text as a sequence of IDs.""" token_ids = [] for char in text: @@ -80,14 +81,14 @@ def encode(self, text: str) -> List[int]: logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char)) return token_ids - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: """Decodes a sequence of IDs to a string of text.""" text = "" for token_id in token_ids: text += self.characters.id_to_char(token_id) return text - def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument + def text_to_ids(self, text: str, language: str = None) -> list[int]: # pylint: disable=unused-argument """Converts a string of text to a sequence of token IDs. Args: @@ -121,15 +122,15 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: text = self.pad_with_bos_eos(text) return text - def ids_to_text(self, id_sequence: List[int]) -> str: + def ids_to_text(self, id_sequence: list[int]) -> str: """Converts a sequence of token IDs to a string of text.""" return self.decode(id_sequence) - def pad_with_bos_eos(self, char_sequence: List[str]): + def pad_with_bos_eos(self, char_sequence: list[str]): """Pads a sequence with the special BOS and EOS characters.""" return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id] - def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False): + def intersperse_blank_char(self, char_sequence: list[str], use_blank_char: bool = False): """Intersperses the blank character between characters in a sequence. Use the ```blank``` character if defined else use the ```pad``` character. 
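
One nuance in the tokenizer hunk above: `Union` is kept (rather than the `|` operator) for the `phonemizer` parameter, presumably because one operand is the string forward reference `"Phonemizer"`, which does not support `|` when the annotation is evaluated at runtime. A minimal standalone sketch of that distinction; the class and function below are hypothetical stand-ins, not code from the repository:

```python
from typing import Union


class Phonemizer:
    """Hypothetical stand-in for the real phonemizer class, for illustration only."""


# Works: typing.Union accepts a string forward reference.
def make_tokenizer(phonemizer: Union["Phonemizer", dict] = None) -> None: ...


# Would fail at import time without `from __future__ import annotations`,
# because a str has no "|" operator:
#   TypeError: unsupported operand type(s) for |: 'str' and 'type'
# def make_tokenizer(phonemizer: "Phonemizer" | dict = None) -> None: ...
```
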
@@ -163,7 +164,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): """ # init cleaners text_cleaner = None - if isinstance(config.text_cleaner, (str, list)): + if isinstance(config.text_cleaner, str | list): text_cleaner = getattr(cleaners, config.text_cleaner) # init characters diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 0cba7fc8a8..7fd4259178 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,7 +1,7 @@ import logging import os from io import BytesIO -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -21,7 +21,7 @@ def build_mel_basis( fft_size: int, num_mels: int, mel_fmin: int, - mel_fmax: Optional[int] = None, + mel_fmax: int | None = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. @@ -177,8 +177,8 @@ def stft( *, y: np.ndarray, fft_size: int, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -205,8 +205,8 @@ def stft( def istft( *, y: np.ndarray, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, window: str = "hann", center: bool = True, **kwargs, @@ -248,8 +248,8 @@ def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool def compute_f0( *, x: np.ndarray, - pitch_fmax: Optional[float] = None, - pitch_fmin: Optional[float] = None, + pitch_fmax: float | None = None, + pitch_fmin: float | None = None, hop_length: int, win_length: int, sample_rate: int, @@ -408,7 +408,7 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n def load_wav( - *, filename: Union[str, os.PathLike[Any]], sample_rate: Optional[int] = None, resample: bool = False, **kwargs + *, filename: str | os.PathLike[Any], sample_rate: int | None = None, resample: bool = False, **kwargs ) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. @@ -437,7 +437,7 @@ def load_wav( def save_wav( *, wav: np.ndarray, - path: Union[str, os.PathLike[Any]], + path: str | os.PathLike[Any], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index bf07333aea..55b8575aa4 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -222,9 +222,9 @@ def __init__( self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert ( - self.win_length <= self.fft_size - ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert self.win_length <= self.fft_size, ( + f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + ) members = vars(self) logger.info("Setting up Audio Processor...") for key, value in members.items(): @@ -283,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_norm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) return S_norm S_norm = self.max_norm * S_norm @@ -318,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_denorm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db @@ -351,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np. if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key], ( + f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + ) return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init @@ -549,7 +553,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: str | os.PathLike[Any], sr: int | None = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -576,9 +580,7 @@ def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = N x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav( - self, wav: np.ndarray, path: Union[str, os.PathLike[Any]], sr: Optional[int] = None, pipe_out=None - ) -> None: + def save_wav(self, wav: np.ndarray, path: str | os.PathLike[Any], sr: int | None = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. 
Args: diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py deleted file mode 100644 index 511d215c65..0000000000 --- a/TTS/utils/callbacks.py +++ /dev/null @@ -1,105 +0,0 @@ -class TrainerCallback: - @staticmethod - def on_init_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_start"): - trainer.model.module.on_init_start(trainer) - else: - if hasattr(trainer.model, "on_init_start"): - trainer.model.on_init_start(trainer) - - if hasattr(trainer.criterion, "on_init_start"): - trainer.criterion.on_init_start(trainer) - - if hasattr(trainer.optimizer, "on_init_start"): - trainer.optimizer.on_init_start(trainer) - - @staticmethod - def on_init_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_end"): - trainer.model.module.on_init_end(trainer) - else: - if hasattr(trainer.model, "on_init_end"): - trainer.model.on_init_end(trainer) - - if hasattr(trainer.criterion, "on_init_end"): - trainer.criterion.on_init_end(trainer) - - if hasattr(trainer.optimizer, "on_init_end"): - trainer.optimizer.on_init_end(trainer) - - @staticmethod - def on_epoch_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_start"): - trainer.model.module.on_epoch_start(trainer) - else: - if hasattr(trainer.model, "on_epoch_start"): - trainer.model.on_epoch_start(trainer) - - if hasattr(trainer.criterion, "on_epoch_start"): - trainer.criterion.on_epoch_start(trainer) - - if hasattr(trainer.optimizer, "on_epoch_start"): - trainer.optimizer.on_epoch_start(trainer) - - @staticmethod - def on_epoch_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_end"): - trainer.model.module.on_epoch_end(trainer) - else: - if hasattr(trainer.model, "on_epoch_end"): - trainer.model.on_epoch_end(trainer) - - if hasattr(trainer.criterion, "on_epoch_end"): - trainer.criterion.on_epoch_end(trainer) - - if hasattr(trainer.optimizer, "on_epoch_end"): - trainer.optimizer.on_epoch_end(trainer) - - @staticmethod - def on_train_step_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_start"): - trainer.model.module.on_train_step_start(trainer) - else: - if hasattr(trainer.model, "on_train_step_start"): - trainer.model.on_train_step_start(trainer) - - if hasattr(trainer.criterion, "on_train_step_start"): - trainer.criterion.on_train_step_start(trainer) - - if hasattr(trainer.optimizer, "on_train_step_start"): - trainer.optimizer.on_train_step_start(trainer) - - @staticmethod - def on_train_step_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_end"): - trainer.model.module.on_train_step_end(trainer) - else: - if hasattr(trainer.model, "on_train_step_end"): - trainer.model.on_train_step_end(trainer) - - if hasattr(trainer.criterion, "on_train_step_end"): - trainer.criterion.on_train_step_end(trainer) - - if hasattr(trainer.optimizer, "on_train_step_end"): - trainer.optimizer.on_train_step_end(trainer) - - @staticmethod - def on_keyboard_interrupt(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_keyboard_interrupt"): - trainer.model.module.on_keyboard_interrupt(trainer) - else: - if hasattr(trainer.model, "on_keyboard_interrupt"): - trainer.model.on_keyboard_interrupt(trainer) - - if hasattr(trainer.criterion, "on_keyboard_interrupt"): - 
trainer.criterion.on_keyboard_interrupt(trainer) - - if hasattr(trainer.optimizer, "on_keyboard_interrupt"): - trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index 7206ffd508..01f303f98d 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -1,4 +1,4 @@ -from typing import Generator +from collections.abc import Generator from trainer.trainer_utils import get_optimizer diff --git a/TTS/utils/download.py b/TTS/utils/download.py index e94b1d68c8..75ef9164f6 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -7,8 +7,9 @@ import urllib import urllib.request import zipfile +from collections.abc import Iterable from os.path import expanduser -from typing import Any, Iterable, List, Optional +from typing import Any from torch.utils.model_zoo import tqdm @@ -16,7 +17,7 @@ def stream_url( - url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True + url: str, start_byte: int | None = None, block_size: int = 32 * 1024, progress_bar: bool = True ) -> Iterable: """Stream url by chunk @@ -36,7 +37,7 @@ def stream_url( req = urllib.request.Request(url) if start_byte: - req.headers["Range"] = "bytes={}-".format(start_byte) + req.headers["Range"] = f"bytes={start_byte}-" with ( urllib.request.urlopen(req) as upointer, @@ -61,8 +62,8 @@ def stream_url( def download_url( url: str, download_folder: str, - filename: Optional[str] = None, - hash_value: Optional[str] = None, + filename: str | None = None, + hash_value: str | None = None, hash_type: str = "sha256", progress_bar: bool = True, resume: bool = False, @@ -88,10 +89,10 @@ def download_url( filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): mode = "ab" - local_size: Optional[int] = os.path.getsize(filepath) + local_size: int | None = os.path.getsize(filepath) elif not resume and os.path.exists(filepath): - raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"{filepath} already exists. Delete the file manually and retry.") else: mode = "wb" local_size = None @@ -100,7 +101,7 @@ def download_url( with open(filepath, "rb") as file_obj: if validate_file(file_obj, hash_value, hash_type): return - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -108,7 +109,7 @@ def download_url( with open(filepath, "rb") as file_obj: if hash_value and not validate_file(file_obj, hash_value, hash_type): - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: @@ -140,7 +141,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> return hash_func.hexdigest() == hash_value -def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: +def extract_archive(from_path: str, to_path: str | None = None, overwrite: bool = False) -> list[str]: """Extract archive. Args: from_path (str): the path of the archive. 
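
For orientation, a hypothetical call of the two download helpers whose updated signatures appear above. The URL and paths are placeholders, and the filename is passed explicitly rather than relying on defaults:

```python
from TTS.utils.download import download_url, extract_archive

# Hypothetical usage sketch of the helpers edited above; the URL and paths
# are placeholders, not values taken from the patch.
url = "https://example.com/corpus.tar.gz"
download_url(
    url,
    download_folder="/tmp/tts_downloads",
    filename="corpus.tar.gz",  # set explicitly instead of relying on the default
    hash_value=None,           # skip checksum verification in this sketch
    progress_bar=True,
    resume=False,
)
extracted = extract_archive(
    "/tmp/tts_downloads/corpus.tar.gz", to_path="/tmp/tts_corpus", overwrite=False
)
print(extracted)  # list[str] of extracted paths, per the annotated return type
```
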
diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 8705873982..c06c2649ad 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,6 +1,5 @@ import logging import os -from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive @@ -21,7 +20,7 @@ def download_ljspeech(path: str): extract_archive(archive) -def download_vctk(path: str, use_kaggle: Optional[bool] = False): +def download_vctk(path: str, use_kaggle: bool | None = False): """Download and extract VCTK dataset. Args: @@ -49,7 +48,7 @@ def download_tweb(path: str): download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) -def download_libri_tts(path: str, subset: Optional[str] = "all"): +def download_libri_tts(path: str, subset: str | None = "all"): """Download and extract libri tts dataset. Args: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 54bb5ba825..e1df6f6ed4 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,11 +1,11 @@ -# -*- coding: utf-8 -*- import datetime import importlib import logging import os import re +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable, Dict, Optional, TextIO, TypeVar, Union +from typing import Any, TextIO, TypeVar import torch from packaging.version import Version @@ -16,11 +16,11 @@ _T = TypeVar("_T") -def exists(val: Union[_T, None]) -> TypeIs[_T]: +def exists(val: _T | None) -> TypeIs[_T]: return val is not None -def default(val: Union[_T, None], d: Union[_T, Callable[[], _T]]) -> _T: +def default(val: _T | None, d: _T | Callable[[], _T]) -> _T: if exists(val): return val return d() if callable(d) else d @@ -31,6 +31,7 @@ def to_camel(text): text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) text = text.replace("Tts", "TTS") text = text.replace("vc", "VC") + text = text.replace("Knn", "KNN") return text @@ -68,7 +69,7 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: dict, kwargs: dict) -> dict: """Format kwargs to hande auxilary inputs to models. Args: @@ -79,9 +80,9 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: Dict: arguments with formatted auxilary inputs. """ kwargs = kwargs.copy() - for name in def_args: + for name, arg in def_args.items(): if name not in kwargs or kwargs[name] is None: - kwargs[name] = def_args[name] + kwargs[name] = arg return kwargs @@ -107,9 +108,9 @@ def setup_logger( logger_name: str, level: int = logging.INFO, *, - formatter: Optional[logging.Formatter] = None, - stream: Optional[TextIO] = None, - log_dir: Optional[Union[str, os.PathLike[Any]]] = None, + formatter: logging.Formatter | None = None, + stream: TextIO | None = None, + log_dir: str | os.PathLike[Any] | None = None, log_name: str = "log", ) -> None: """Set up a logger. 
@@ -145,6 +146,6 @@ def is_pytorch_at_least_2_4() -> bool: return Version(torch.__version__) >= Version("2.4") -def optional_to_str(x: Optional[Any]) -> str: +def optional_to_str(x: Any | None) -> str: """Convert input to string, using empty string if input is None.""" return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index d7d4deab9d..20d6ab226b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,7 +6,7 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Any, Optional, TypedDict, Union +from typing import Any, TypedDict import fsspec import requests @@ -15,6 +15,7 @@ from typing_extensions import Required from TTS.config import load_config, read_json_with_comments +from TTS.vc.configs.knnvc_config import KNNVCConfig logger = logging.getLogger(__name__) @@ -26,12 +27,12 @@ class ModelItem(TypedDict, total=False): license: str author: str contact: str - commit: Optional[str] + commit: str | None model_hash: str tos_required: bool - default_vocoder: Optional[str] - model_url: Union[str, list[str]] - github_rls_url: Union[str, list[str]] + default_vocoder: str | None + model_url: str | list[str] + github_rls_url: str | list[str] hf_url: list[str] @@ -48,7 +49,7 @@ class ModelItem(TypedDict, total=False): } -class ModelManager(object): +class ModelManager: tqdm_progress = None """Manage TTS models defined in .models.json. It provides an interface to list and download @@ -65,8 +66,8 @@ class ModelManager(object): def __init__( self, - models_file: Optional[Union[str, os.PathLike[Any]]] = None, - output_prefix: Optional[Union[str, os.PathLike[Any]]] = None, + models_file: str | os.PathLike[Any] | None = None, + output_prefix: str | os.PathLike[Any] | None = None, progress_bar: bool = False, ) -> None: super().__init__() @@ -83,7 +84,7 @@ def __init__( path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def read_models_file(self, file_path: str | os.PathLike[Any]) -> None: """Read .models.json as a dict Args: @@ -267,13 +268,13 @@ def set_model_url(model_item: ModelItem) -> ModelItem: model_item["model_url"] = model_item["github_rls_url"] elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] - elif "fairseq" in model_item["model_name"]: + elif "fairseq" in model_item.get("model_name", ""): model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" - elif "xtts" in model_item["model_name"]: + elif "xtts" in model_item.get("model_name", ""): model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, Optional[str]]: + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, str | None]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") @@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem, logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e + checkpoints = list(Path(output_path).glob("*.pt*")) + if len(checkpoints) == 1: + checkpoints[0].rename(checkpoints[0].parent / "model.pth") self.print_model_license(model_item=model_item) def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: @@ -385,7 +389,7 @@ def check_if_configs_are_equal(self, model_name: 
str, model_item: ModelItem, out logger.info("%s is already downloaded however it has been changed. Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelItem]: + def download_model(self, model_name: str) -> tuple[Path, Path | None, ModelItem]: """Download model files given the full model name. Model name is in the format 'type/language/dataset/model' @@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) else: output_config_path = output_model_path / "config.json" + if model == "knnvc" and not output_config_path.exists(): + knnvc_config = KNNVCConfig() + knnvc_config.save_json(output_config_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item @@ -464,7 +471,7 @@ def _find_files(output_path: Path) -> tuple[Path, Path]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: Path) -> Optional[Path]: + def _find_speaker_encoder(output_path: Path) -> Path | None: """Find the speaker encoder file in the output path Args: @@ -516,7 +523,7 @@ def _update_paths(self, output_path: Path, config_path: Path) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name: str, new_path: Optional[Path], config_path: Path) -> None: + def _update_path(field_name: str, new_path: Path | None, config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" if new_path is not None and new_path.is_file(): config = load_config(str(config_path)) @@ -612,9 +619,7 @@ def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) - rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files( - file_urls: list[str], output_folder: Union[str, os.PathLike[Any]], progress_bar: bool - ) -> None: + def _download_model_files(file_urls: list[str], output_folder: str | os.PathLike[Any], progress_bar: bool) -> None: """Download the github releases""" output_folder = Path(output_folder) for file_url in file_urls: diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index cbd14990f3..b893d115c9 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -9,16 +9,16 @@ class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) + raise ValueError(f"Invalid learning rate: {lr}") if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) + raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + raise ValueError(f"Invalid beta parameter at 
index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if isinstance(params, list | tuple) and len(params) > 0 and isinstance(params[0], dict): for param in params: if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index b08a763a33..d24733977a 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,6 @@ import math import random -from typing import Callable, List, Union +from collections.abc import Callable from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler @@ -49,9 +49,9 @@ def __init__( label_key="class_name", ): super().__init__(dataset_items) - assert ( - batch_size % (num_classes_in_batch * num_gpus) == 0 - ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + ) label_indices = {} for idx, item in enumerate(dataset_items): @@ -176,7 +176,7 @@ def __init__( data, batch_size, drop_last, - sort_key: Union[Callable, List] = identity, + sort_key: Callable | list = identity, bucket_size_multiplier=100, ): super().__init__(sampler, batch_size, drop_last) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 517cb7d2b2..cebb094a48 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -2,7 +2,7 @@ import os import time from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any import numpy as np import pysbd @@ -30,18 +30,18 @@ class Synthesizer(nn.Module): def __init__( self, *, - tts_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - tts_config_path: Optional[Union[str, os.PathLike[Any]]] = None, - tts_speakers_file: Optional[Union[str, os.PathLike[Any]]] = None, - tts_languages_file: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - vc_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - vc_config: Optional[Union[str, os.PathLike[Any]]] = None, - model_dir: Optional[Union[str, os.PathLike[Any]]] = None, - voice_dir: Optional[Union[str, os.PathLike[Any]]] = None, + tts_checkpoint: str | os.PathLike[Any] | None = None, + tts_config_path: str | os.PathLike[Any] | None = None, + tts_speakers_file: str | os.PathLike[Any] | None = None, + tts_languages_file: str | os.PathLike[Any] | None = None, + vocoder_checkpoint: str | os.PathLike[Any] | None = None, + vocoder_config: str | os.PathLike[Any] | None = None, + encoder_checkpoint: str | os.PathLike[Any] | None = None, + encoder_config: str | os.PathLike[Any] | None = None, + vc_checkpoint: str | os.PathLike[Any] | None = None, + vc_config: str | os.PathLike[Any] | None = None, + model_dir: str | os.PathLike[Any] | None = None, + voice_dir: str | os.PathLike[Any] | None = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. 
It takes a tts and a vocoder @@ -98,12 +98,12 @@ def __init__( if tts_checkpoint: self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda) - if vocoder_checkpoint: - self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) - if vc_checkpoint and model_dir == "": self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda) + if vocoder_checkpoint: + self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) + if model_dir: if "fairseq" in model_dir: self._load_fairseq_from_dir(model_dir, use_cuda) @@ -139,7 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + self.output_sample_rate = self.vc_config.audio.get( + "output_sample_rate", self.vc_config.audio.get("sample_rate", None) + ) self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -246,7 +248,7 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N if use_cuda: self.vocoder_model.cuda() - def split_into_sentences(self, text) -> List[str]: + def split_into_sentences(self, text) -> list[str]: """Split give text into sentences. Args: @@ -257,7 +259,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + def save_wav(self, wav: list[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. Args: @@ -272,9 +274,21 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: - output_wav = self.vc_model.voice_conversion(source_wav, target_wav) - return output_wav + def voice_conversion(self, source_wav: str, target_wav: str | list[str], **kwargs) -> list[int]: + start_time = time.time() + + if not isinstance(target_wav, list): + target_wav = [target_wav] + output = self.vc_model.voice_conversion(source_wav, target_wav, **kwargs) + if self.vocoder_model is not None: + output = self.vocoder_model.inference(output) + + output = output.squeeze() + process_time = time.time() - start_time + audio_time = len(output) / self.output_sample_rate + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) + return output def tts( self, @@ -288,7 +302,7 @@ def tts( reference_speaker_name=None, split_sentences: bool = True, **kwargs, - ) -> List[int]: + ) -> list[int]: """🐸 TTS magic. Run all the models and generate speech. 
Args: diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index d600bfb1f4..37f8048b7f 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List, Optional from coqpit import Coqpit @@ -47,7 +46,7 @@ class FreeVCAudioConfig(Coqpit): win_length: int = field(default=1280) n_mel_channels: int = field(default=80) mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) + mel_fmax: float | None = field(default=None) @dataclass @@ -122,11 +121,11 @@ class FreeVCArgs(Coqpit): kernel_size: int = field(default=3) p_dropout: float = field(default=0.1) resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [10, 8, 2, 2]) upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) n_layers_q: int = field(default=3) use_spectral_norm: bool = field(default=False) gin_channels: int = field(default=256) @@ -269,7 +268,7 @@ class FreeVCConfig(BaseVCConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py new file mode 100644 index 0000000000..7728ea0a9b --- /dev/null +++ b/TTS/vc/configs/knnvc_config.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class KNNVCAudioConfig(BaseAudioConfig): + """Audio configuration. + + Args: + sample_rate (int): + The sampling rate of the input waveform. + """ + + sample_rate: int = field(default=16000) + + +@dataclass +class KNNVCArgs(Coqpit): + """Model arguments. + + Args: + ssl_dim (int): + The dimension of the self-supervised learning embedding. + """ + + ssl_dim: int = field(default=1024) + + +@dataclass +class KNNVCConfig(BaseVCConfig): + """Parameters. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (KNNVCArgs): + Model architecture arguments. Defaults to `KNNVCArgs()`. + + audio (KNNVCAudioConfig): + Audio processing configuration. Defaults to `KNNVCAudioConfig()`. + + wavlm_layer (int): + WavLM layer to use for feature extraction. 
+ + topk (int): + k in the kNN -- the number of nearest neighbors to average over + """ + + model: str = "knnvc" + model_args: KNNVCArgs = field(default_factory=KNNVCArgs) + audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig) + + wavlm_layer: int = 6 + topk: int = 4 diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py index 261cdd6f47..167a61ddb3 100644 --- a/TTS/vc/configs/openvoice_config.py +++ b/TTS/vc/configs/openvoice_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from coqpit import Coqpit @@ -187,13 +186,13 @@ class OpenVoiceConfig(BaseVCConfig): # multi-speaker settings # use speaker embedding layer num_speakers: int = 0 - speakers_file: Optional[str] = None + speakers_file: str | None = None speaker_embedding_channels: int = 256 # use d-vectors use_d_vector_file: bool = False - d_vector_file: Optional[list[str]] = None - d_vector_dim: Optional[int] = None + d_vector_file: list[str] | None = None + d_vector_dim: int | None = None def __post_init__(self) -> None: for key, val in self.model_args.items(): diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index b2fe63d29d..b84a97e487 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,12 +1,11 @@ from dataclasses import dataclass, field -from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @dataclass class BaseVCConfig(BaseTrainingConfig): - """Shared parameters among all the tts models. + """Shared parameters among all the VC models. Args: @@ -132,7 +131,7 @@ class BaseVCConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -140,7 +139,7 @@ class BaseVCConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/vc/layers/freevc/modules.py b/TTS/vc/layers/freevc/modules.py index c34f22d701..92df39b5e0 100644 --- a/TTS/vc/layers/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -48,7 +48,7 @@ def forward(self, x, x_mask): class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -122,7 +122,7 @@ def remove_weight_norm(self): class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() + super().__init__() self.convs1 = nn.ModuleList( [ weight_norm( @@ -198,7 +198,7 @@ def remove_weight_norm(self): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() + super().__init__() self.convs = nn.ModuleList( [ weight_norm( diff --git a/TTS/vc/layers/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py index 5fa317ce45..5d14bf2f19 100644 --- 
a/TTS/vc/layers/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Optional, Union # import webrtcvad import librosa @@ -16,7 +15,7 @@ int16_max = (2**15) - 1 -def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): +def preprocess_wav(fpath_or_wav: str | Path | np.ndarray, source_sr: int | None = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. diff --git a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index a6d5bcf942..d2f4ffe394 100644 --- a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,6 +1,5 @@ import logging from time import perf_counter as timer -from typing import List, Union import numpy as np import torch @@ -22,12 +21,8 @@ class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None): - """ - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). - If None, defaults to cuda if it is available on your machine, otherwise the model will - run on cpu. Outputs are always returned on the cpu, as numpy arrays. - """ + def __init__(self, weights_fpath): + """FreeVC speaker encoder.""" super().__init__() # Define the network @@ -35,13 +30,6 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() - # Get the target device - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - elif isinstance(device, str): - device = torch.device(device) - self.device = device - # Load the pretrained model'speaker weights # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") # if not weights_fpath.exists(): @@ -52,8 +40,11 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): checkpoint = load_fsspec(weights_fpath, map_location="cpu") self.load_state_dict(checkpoint["model_state"], strict=False) - self.to(device) - logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) + logger.info("Loaded the voice encoder model in %.2f seconds.", timer() - start) + + @property + def device(self): + return next(self.parameters()).device def forward(self, mels: torch.FloatTensor): """ @@ -97,7 +88,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): assert 0 < min_coverage <= 1 # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + samples_per_frame = int(sampling_rate * mel_window_step / 1000) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) assert 0 < frame_step, "The rate is too high" @@ -123,7 +114,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): return wav_slices, mel_slices - def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75) -> torch.Tensor: """ Computes an embedding for a single utterance. 
The utterance is divided in partial utterances and an embedding is computed for each. The complete utterance embedding is the @@ -143,8 +134,8 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ then the last partial utterance will be considered by zero-padding the audio. Otherwise, it will be discarded. If there aren't enough frames for one partial utterance, this parameter is ignored so that the function always returns at least one slice. - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - <return_partials> is True, the partial utterances as a numpy array of float32 of shape + :return: the embedding as a float tensor of shape (model_embedding_size,). If + <return_partials> is True, the partial utterances as a float tensor of shape (n_partials, model_embedding_size) and the wav partials as a list of slices will also be returned. """ @@ -160,24 +151,26 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ mels = np.array([mel[s] for s in mel_slices]) with torch.no_grad(): mels = torch.from_numpy(mels).to(self.device) - partial_embeds = self(mels).cpu().numpy() + partial_embeds = self(mels) # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = partial_embeds.mean(dim=0) + embed = raw_embed / torch.norm(raw_embed, p=2) if return_partials: return embed, partial_embeds, wav_slices return embed - def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + def embed_speaker(self, wavs: list[np.ndarray], **kwargs): """ Compute the embedding of a collection of wavs (presumably from the same speaker) by averaging their embedding and L2-normalizing it. :param wavs: list of wavs a numpy arrays of float32. :param kwargs: extra arguments to embed_utterance() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + :return: the embedding as a float tensor of shape (model_embedding_size,).
""" - raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0) - return raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = torch.mean( + torch.stack([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]), dim=0 + ) + return raw_embed / torch.norm(raw_embed, p=2) diff --git a/TTS/vc/layers/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py index 62f7e74aaf..d9c3858f89 100644 --- a/TTS/vc/layers/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -13,7 +13,7 @@ model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" -def get_wavlm(device="cpu"): +def get_wavlm(device="cpu") -> WavLM: """Download the model and return the model object.""" output_path = get_user_data_dir("tts") diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py index 37c1a6e877..cf31a866de 100644 --- a/TTS/vc/layers/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -9,7 +9,6 @@ import math import warnings -from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -89,7 +88,7 @@ class Swish(nn.Module): def __init__(self): """Construct an MultiHeadedAttention object.""" - super(Swish, self).__init__() + super().__init__() self.act = torch.nn.Sigmoid() def forward(self, x): @@ -98,7 +97,7 @@ def forward(self, x): class GLU_Linear(nn.Module): def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): - super(GLU_Linear, self).__init__() + super().__init__() self.glu_type = glu_type self.output_dim = output_dim @@ -158,7 +157,7 @@ def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError(f"--activation-fn {activation} not supported") def init_bert_params(module): @@ -219,7 +218,7 @@ def quant_noise(module, p, block_size): return module # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + assert isinstance(module, nn.Linear | nn.Embedding | nn.Conv2d) # test whether module.weight has the right sizes wrt block_size is_conv = module.weight.ndim == 4 @@ -331,7 +330,7 @@ def __init__( self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" + "Self-attention requires query, key and value to be of the same size" ) k_bias = True @@ -424,17 +423,17 @@ def compute_bias(self, query_length, key_length): def forward( self, query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + key: Tensor | None, + value: Tensor | None, + key_padding_mask: Tensor | None = None, + incremental_state: dict[str, dict[str, Tensor | None]] | None = None, need_weights: bool = True, static_kv: bool = False, - attn_mask: Optional[Tensor] = None, + attn_mask: Tensor | None = None, before_softmax: bool = False, need_head_weights: bool = False, - position_bias: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + position_bias: Tensor | None = None, + ) -> tuple[Tensor, Tensor | None, Tensor | None]: """Input shape: Time x Batch x Channel Args: @@ -605,7 +604,7 @@ def forward( else: assert v is not None v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: 
Optional[Tensor] = None + prev_key_padding_mask: Tensor | None = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None @@ -700,7 +699,7 @@ def forward( assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None + attn_weights: Tensor | None = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: @@ -711,12 +710,12 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], + key_padding_mask: Tensor | None, + prev_key_padding_mask: Tensor | None, batch_size: int, src_len: int, static_kv: bool, - ) -> Optional[Tensor]: + ) -> Tensor | None: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask @@ -748,19 +747,19 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: + self, incremental_state: dict[str, dict[str, Tensor | None]] | None + ) -> dict[str, Tensor | None]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: - empty_result: Dict[str, Optional[Tensor]] = {} + empty_result: dict[str, Tensor | None] = {} return empty_result def _set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + incremental_state: dict[str, dict[str, Tensor | None]], + buffer: dict[str, Tensor | None], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index 775f3e5979..6358662e18 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -33,8 +33,8 @@ def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[torch.Tensor], + shape: tuple[int, int], + padding_mask: torch.Tensor | None, mask_prob: float, mask_length: int, mask_type: str = "static", @@ -68,8 +68,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() + mask_prob * all_sz / float(mask_length) + np.random.rand() ) all_num_mask = max(min_masks, all_num_mask) @@ -80,8 +79,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() + mask_prob * sz / float(mask_length) + np.random.rand() ) num_mask = max(min_masks, num_mask) else: @@ -155,9 +153,7 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = ( - "default" # mode for feature extractor. 
default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - ) + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -166,9 +162,7 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = ( - "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - ) + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -225,7 +219,7 @@ def __init__( cfg: WavLMConfig, ) -> None: super().__init__() - logger.info(f"WavLM Config: {cfg.__dict__}") + logger.info("WavLM Config: %s", cfg.__dict__) self.cfg = cfg feature_enc_layers = eval(cfg.conv_feature_layers) @@ -317,12 +311,12 @@ def forward_padding_mask( def extract_features( self, source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + padding_mask: torch.Tensor | None = None, mask: bool = False, ret_conv: bool = False, - output_layer: Optional[int] = None, + output_layer: int | None = None, ret_layer_results: bool = False, - ): + ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: @@ -367,7 +361,7 @@ def extract_features( class ConvFeatureExtractionModel(nn.Module): def __init__( self, - conv_layers: List[Tuple[int, int, int]], + conv_layers: list[tuple[int, int, int]], dropout: float = 0.0, mode: str = "default", conv_bias: bool = False, diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a9807d7006..859eaeb2a7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,15 +1,21 @@ import importlib import logging import re -from typing import Dict, List, Union + +from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.vc.models.base_vc import BaseVC logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": +def setup_model(config: BaseVCConfig) -> BaseVC: logger.info("Using model: %s", config.model) # fetch the right model implementation. - if "model" in config and config["model"].lower() == "freevc": + if config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC - model = MyModel.init_from_config(config, samples) - return model + elif config["model"].lower() == "knnvc": + MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC + else: + msg = f"Model {config.model} does not exist!" 
+ raise ValueError(msg) + return MyModel.init_from_config(config) diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 22ffd0095c..a953b901e8 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import torch import torch.distributed as dist @@ -37,9 +37,9 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: AudioProcessor, - speaker_manager: Optional[SpeakerManager] = None, - language_manager: Optional[LanguageManager] = None, + ap: AudioProcessor | None = None, + speaker_manager: SpeakerManager | None = None, + language_manager: LanguageManager | None = None, ) -> None: super().__init__() self.config = config @@ -51,7 +51,7 @@ def __init__( def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). - `ModelArgs` has all the fields reuqired to initialize the model architecture. + `ModelArgs` has all the fields required to initialize the model architecture. `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`. @@ -69,7 +69,7 @@ def _set_model_args(self, config: Coqpit) -> None: else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining `in_channels` size of the connected layers. @@ -106,7 +106,7 @@ def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: + def get_aux_input_from_test_sentences(self, sentence_info: str | list[str]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -275,10 +275,10 @@ def get_data_loader( config: Coqpit, assets: dict, is_eval: bool, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: Optional[int] = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -402,13 +402,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer: Trainer) -> None: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index c654219c39..59af40a836 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List, Optional, Tuple, Union import librosa import numpy as np @@ -102,7 +101,7 @@ def __init__( upsample_kernel_sizes, gin_channels=0, ): - super(Generator, self).__init__() + super().__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) @@ -165,7 +164,7 @@ def remove_weight_norm(self): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() + super().__init__() periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -190,7 +189,7 @@ def forward(self, y, y_hat): class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): - super(SpeakerEncoder, self).__init__() + super().__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() @@ -233,7 +232,7 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): class FreeVC(BaseVC): """ - Papaer:: + Paper:: https://arxiv.org/abs/2210.15418# Paper Abstract:: @@ -306,15 +305,11 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): self.wavlm = get_wavlm() - @property - def device(self): - return next(self.parameters()).device - def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" ) def init_multispeaker(self, config: Coqpit): @@ -335,15 +330,15 @@ def forward( self, c: torch.Tensor, spec: torch.Tensor, - g: Optional[torch.Tensor] = None, - mel: Optional[torch.Tensor] = None, - c_lengths: 
Optional[torch.Tensor] = None, - spec_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[ + g: torch.Tensor | None = None, + mel: torch.Tensor | None = None, + c_lengths: torch.Tensor | None = None, + spec_lengths: torch.Tensor | None = None, + ) -> tuple[ torch.Tensor, torch.Tensor, torch.Tensor, - Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ]: """ Forward pass of the model. @@ -389,8 +384,8 @@ def forward( return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() - def inference(self, c, g=None, mel=None, c_lengths=None): + @torch.inference_mode() + def inference(self, c, g=None, c_lengths=None): """ Inference pass of the model @@ -405,9 +400,6 @@ def inference(self, c, g=None, mel=None, c_lengths=None): """ if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) - if not self.use_spk: - g = self.enc_spk.embed_utterance(mel) - g = g.unsqueeze(-1) z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g) @@ -438,51 +430,52 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src, tgt): + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]): """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterances. Returns: torch.Tensor: Output tensor. """ - wav_tgt = self.load_audio(tgt).cpu().numpy() - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - if self.config.model_args.use_spk: - g_tgt = self.enc_spk_ex.embed_utterance(wav_tgt) - g_tgt = torch.from_numpy(g_tgt)[None, :, None].to(self.device) - else: - wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) - mel_tgt = mel_spectrogram_torch( - wav_tgt, - self.config.audio.filter_length, - self.config.audio.n_mel_channels, - self.config.audio.input_sample_rate, - self.config.audio.hop_length, - self.config.audio.win_length, - self.config.audio.mel_fmin, - self.config.audio.mel_fmax, - ) # src wav_src = self.load_audio(src) c = self.extract_wavlm_features(wav_src[None, :]) - if self.config.model_args.use_spk: - audio = self.inference(c, g=g_tgt) - else: - audio = self.inference(c, mel=mel_tgt.transpose(1, 2)) - audio = audio[0][0].data.cpu().float().numpy() - return audio + # tgt + g_tgts = [] + for tg in tgt: + wav_tgt = self.load_audio(tg).cpu().numpy() + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + + if self.config.model_args.use_spk: + g_tgts.append(self.enc_spk_ex.embed_utterance(wav_tgt)[None, :, None]) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + self.config.audio.filter_length, + self.config.audio.n_mel_channels, + self.config.audio.input_sample_rate, + self.config.audio.hop_length, + self.config.audio.win_length, + self.config.audio.mel_fmin, + self.config.audio.mel_fmax, + ) + g_tgts.append(self.enc_spk.embed_utterance(mel_tgt.transpose(1, 2)).unsqueeze(-1)) + + g_tgt = torch.stack(g_tgts).mean(dim=0) + audio = self.inference(c, g=g_tgt) + return audio[0][0].data.cpu().float().numpy() def eval_step(): ... 
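The multi-reference handling added to `FreeVC.voice_conversion` above reduces any number of target utterances to a single conditioning vector: one speaker embedding is computed per reference clip and the embeddings are mean-pooled before `inference()` is called. A minimal, self-contained sketch of that pooling step (shapes are illustrative; FreeVC's conditioning tensors are `(1, gin_channels, 1)` with `gin_channels=256` by default):

```python
import torch

# One conditioning embedding per reference clip, e.g. from enc_spk / enc_spk_ex (values are stand-ins).
per_clip = [torch.randn(1, 256, 1) for _ in range(3)]
g_tgt = torch.stack(per_clip).mean(dim=0)  # still (1, 256, 1) after averaging over clips
assert g_tgt.shape == (1, 256, 1)
```

Because the averaging happens in embedding space, the decoder still receives a single `g` tensor, so `inference()` and the flow/decoder stack are unchanged for multi-reference conversion.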
@staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: FreeVCConfig) -> "FreeVC": model = FreeVC(config) return model diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py new file mode 100644 index 0000000000..c31f52e749 --- /dev/null +++ b/TTS/vc/models/knnvc.py @@ -0,0 +1,181 @@ +import logging +import os +from typing import Any, TypeAlias + +import torch +import torch.nn.functional as F +import torchaudio +from coqpit import Coqpit + +from TTS.vc.configs.knnvc_config import KNNVCConfig +from TTS.vc.layers.freevc.wavlm import get_wavlm +from TTS.vc.models.base_vc import BaseVC + +logger = logging.getLogger(__name__) + +PathOrTensor: TypeAlias = str | os.PathLike[Any] | torch.Tensor + + +class KNNVC(BaseVC): + """ + Paper:: + https://arxiv.org/abs/2305.18975 + + Paper Abstract:: + Any-to-any voice conversion aims to transform source speech + into a target voice with just a few examples of the target speaker as a + reference. Recent methods produce convincing conversions, but at the cost of + increased complexity -- making results difficult to reproduce and build on. + Instead, we keep it simple. We propose k-nearest neighbors voice conversion + (kNN-VC): a straightforward yet effective method for any-to-any conversion. + First, we extract self-supervised representations of the source and reference + speech. To convert to the target speaker, we replace each frame of the source + representation with its nearest neighbor in the reference. Finally, a pretrained + vocoder synthesizes audio from the converted representation. Objective and + subjective evaluations show that kNN-VC improves speaker similarity with similar + intelligibility scores to existing methods. + + Samples:: + https://bshall.github.io/knn-vc + + Original code:: + https://github.com/bshall/knn-vc + + Examples: + >>> from TTS.vc.configs.knnvc_config import KNNVCConfig + >>> from TTS.vc.models.knnvc import KNNVC + >>> config = KNNVCConfig() + >>> model = KNNVC(config) + """ + + def __init__(self, config: Coqpit): + super().__init__(config) + self.ssl_dim = self.args.ssl_dim + self.wavlm = get_wavlm() + + @staticmethod + def init_from_config(config: KNNVCConfig) -> "KNNVC": + return KNNVC(config) + + @torch.inference_mode() + def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor: + """Return features for the given waveform with output shape (seq_len, dim). + + Optionally perform VAD trimming on start/end with `vad_trigger_level`. 
+ """ + # load audio + if isinstance(audio, torch.Tensor): + x: torch.Tensor = audio + sr = self.config.audio.sample_rate + if x.dim() == 1: + x = x[None] + else: + x, sr = torchaudio.load(audio, normalize=True) + + if not sr == self.config.audio.sample_rate: + logger.info("Resampling %d to %d in %s", sr, self.config.audio.sample_rate, audio) + x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) + sr = self.config.audio.sample_rate + + # trim silence from front and back + if vad_trigger_level > 1e-3: + transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level) + x_front_trim = transform(x) + waveform_reversed = torch.flip(x_front_trim, (-1,)) + waveform_reversed_front_trim = transform(waveform_reversed) + x = torch.flip(waveform_reversed_front_trim, (-1,)) + + # extract the representation of each layer + wav_input_16khz = x.to(self.device) + features = self.wavlm.extract_features( + wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False + )[0] + return features.squeeze(0) + + def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor: + """Get concatenated wavlm features for the matching set using all waveforms in `wavs`. + + Wavs are specified as either a list of paths or list of loaded waveform tensors of + shape (channels, T), assumed to be of 16kHz sample rate. + """ + feats = [] + for p in wavs: + feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level)) + + feats = torch.concat(feats, dim=0).cpu() + return feats + + @staticmethod + def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor: + """Like torch.cdist, but fixed dim=-1 and for cosine distance.""" + source_norms = torch.norm(source_feats, p=2, dim=-1) + matching_norms = torch.norm(matching_pool, p=2, dim=-1) + dotprod = ( + -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2) + + source_norms[:, None] ** 2 + + matching_norms[None] ** 2 + ) + dotprod /= 2 + + dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None])) + return dists + + @torch.inference_mode() + def match( + self, + query_seq: torch.Tensor, + matching_set: torch.Tensor, + synth_set: torch.Tensor | None = None, + topk: int | None = None, + target_duration: float | None = None, + ) -> torch.Tensor: + """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching + with k=`topk`. + + Args: + `query_seq`: Tensor (N1, dim) of the input/source query features. + `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm. + `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. We use the matching set to assign + each query vector to a vector in the matching set, and then use the corresponding vector from + the synth set during HiFiGAN synthesis. + By default, and for best performance, this should be identical to the matching set. + `topk`: k in the kNN -- the number of nearest neighbors to average over. + `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds. 
+ + Returns: + - converted features (1, N, dim) + """ + if topk is None: + topk = self.config.topk + synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device) + matching_set = matching_set.to(self.device) + query_seq = query_seq.to(self.device) + + if target_duration is not None: + target_samples = int(target_duration * self.config.audio.sample_rate) + scale_factor = (target_samples / self.hop_length) / query_seq.shape[0] # n_targ_feats / n_input_feats + query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T + + dists = self.fast_cosine_dist(query_seq, matching_set) + best = dists.topk(k=topk, largest=False, dim=-1) + out_feats = synth_set[best.indices].mean(dim=1) + return out_feats.unsqueeze(0) + + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: str | os.PathLike[Any]) -> None: + """kNN-VC does not use checkpoints.""" + + def forward(self) -> None: ... + def inference(self) -> None: ... + + @torch.inference_mode() + def voice_conversion( + self, + source: PathOrTensor, + target: list[PathOrTensor], + topk: int | None = None, + ) -> torch.Tensor: + if not isinstance(target, list): + target = [target] + source_features = self.get_features(source) + matching_set = self.get_matching_set(target) + return self.match(source_features, matching_set, topk=topk) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py index 135b0861b9..1049a580c7 100644 --- a/TTS/vc/models/openvoice.py +++ b/TTS/vc/models/openvoice.py @@ -1,8 +1,9 @@ import json import logging import os +from collections.abc import Mapping from pathlib import Path -from typing import Any, Mapping, Optional, Union +from typing import Any import librosa import numpy as np @@ -117,7 +118,7 @@ class OpenVoice(BaseVC): October 2023, serving as the backend of MyShell. """ - def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = None) -> None: + def __init__(self, config: Coqpit, speaker_manager: SpeakerManager | None = None) -> None: super().__init__(config, None, speaker_manager, None) self.init_multispeaker(config) @@ -174,15 +175,11 @@ def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = N self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) - @property - def device(self) -> torch.device: - return next(self.parameters()).device - @staticmethod def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": return OpenVoice(config) - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -199,7 +196,7 @@ def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> def load_checkpoint( self, config: OpenVoiceConfig, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -223,16 +220,16 @@ def train_step(self) -> None: ... def eval_step(self) -> None: ... 
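With no trainable conversion module, kNN-VC comes down to the `match()` step defined in `knnvc.py` above: frame-wise k-nearest-neighbour regression over WavLM features. A simplified, self-contained sketch with random stand-in features (the 1024 dimension follows `KNNVCArgs.ssl_dim`, `k=4` follows `KNNVCConfig.topk`; the normalised-dot-product distance is a shortcut equivalent to `fast_cosine_dist`):

```python
import torch
import torch.nn.functional as F

query = torch.randn(100, 1024)     # (N1, dim) source frames, stand-ins for get_features()
matching = torch.randn(500, 1024)  # (N2, dim) reference frames, stand-ins for get_matching_set()

# Cosine distance between every source frame and every reference frame.
q = F.normalize(query, dim=-1)
m = F.normalize(matching, dim=-1)
dists = 1.0 - q @ m.T              # (N1, N2)

# Replace each source frame with the mean of its k nearest reference frames.
best = dists.topk(k=4, largest=False, dim=-1)
converted = matching[best.indices].mean(dim=1)  # (N1, dim), later vocoded into a waveform
print(converted.shape)  # torch.Size([100, 1024])
```

The converted features are what `KNNVC.voice_conversion()` returns; turning them back into audio is left to a vocoder that accepts SSL features, which is what the `pre_linear` addition to the HiFiGAN generator further below enables.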
@staticmethod - def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tensor]]) -> torch.Tensor: + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, torch.Tensor | None]) -> torch.Tensor: if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: return aux_input["x_lengths"] - return torch.tensor(x.shape[1:2]).to(x.device) + return torch.tensor(x.shape[-1:]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x: torch.Tensor, - aux_input: Mapping[str, Optional[torch.Tensor]] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + aux_input: Mapping[str, torch.Tensor | None] = {"x_lengths": None, "g_src": None, "g_tgt": None}, ) -> dict[str, torch.Tensor]: """ Inference pass of the model @@ -271,7 +268,7 @@ def inference( "z_hat": z_hat, } - def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list[float]]) -> torch.Tensor: + def load_audio(self, wav: str | npt.NDArray[np.float32] | torch.Tensor | list[float]) -> torch.Tensor: """Read and format the input audio.""" if isinstance(wav, str): out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) @@ -283,9 +280,8 @@ def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list out = wav return out.to(self.device).float() - def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: - audio_ref = self.load_audio(audio) - y = torch.FloatTensor(audio_ref) + def extract_se(self, audio: str | torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + y = self.load_audio(audio) y = y.to(self.device) y = y.unsqueeze(0) spec = wav_to_spec( @@ -301,19 +297,23 @@ def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, tor return g, spec @torch.inference_mode() - def voice_conversion(self, src: Union[str, torch.Tensor], tgt: Union[str, torch.Tensor]) -> npt.NDArray[np.float32]: + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]) -> npt.NDArray[np.float32]: """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterance. Returns: Output numpy array. """ src_se, src_spec = self.extract_se(src) - tgt_se, _ = self.extract_se(tgt) + tgt_ses = [] + for tg in tgt: + tgt_se, _ = self.extract_se(tg) + tgt_ses.append(tgt_se) + tgt_se = torch.stack(tgt_ses).mean(dim=0) aux_input = {"g_src": src_se, "g_tgt": tgt_se} audio = self.inference(src_spec, aux_input) diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index 9a102f0c89..60dde496b2 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -5,7 +5,7 @@ @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for FullBand MelGAN vocoder. + """Defines parameters for HifiGAN vocoder. 
Example: diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py index 67f324cfce..85662831ee 100644 --- a/TTS/vocoder/configs/univnet_config.py +++ b/TTS/vocoder/configs/univnet_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @@ -96,7 +95,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # model specific params discriminator_model: str = "univnet_discriminator" generator_model: str = "univnet_generator" - generator_model_params: Dict = field( + generator_model_params: dict = field( default_factory=lambda: { "in_channels": 64, "out_channels": 1, @@ -121,7 +120,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # loss weights - overrides stft_loss_weight: float = 2.5 - stft_loss_params: Dict = field( + stft_loss_params: dict = field( default_factory=lambda: { "n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], @@ -133,7 +132,7 @@ class UnivnetConfig(BaseGANVocoderConfig): hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 0 l1_spec_loss_weight: float = 0 - l1_spec_loss_params: Dict = field( + l1_spec_loss_params: dict = field( default_factory=lambda: { "use_mel": True, "sample_rate": 22050, @@ -153,7 +152,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) steps_to_start_discriminator: int = 200000 def __post_init__(self): diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 04462817a8..cef6a50b05 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -1,5 +1,3 @@ -from typing import List - from coqpit import Coqpit from torch.utils.data import Dataset @@ -10,7 +8,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: list) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 0806c0d496..076545f8a2 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -32,7 +32,7 @@ def __init__( super().__init__() self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short @@ -128,9 +128,9 @@ def load_item(self, idx): # correct the audio length wrt padding applied in stft audio = np.pad(audio, (0, self.hop_len), mode="edge") audio = audio[: mel.shape[-1] * self.hop_len] - assert ( - mel.shape[-1] * self.hop_len == audio.shape[-1] - ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + assert mel.shape[-1] * self.hop_len == audio.shape[-1], ( + f" [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + ) audio = torch.from_numpy(audio).float().unsqueeze(0) mel = torch.from_numpy(mel).float().squeeze(0) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 6f34bccb7c..435330bebe 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -2,7 +2,6 @@ import os import random from multiprocessing import Manager -from typing import List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def __getitem__(self, idx): item = self.load_item(idx) return item - def load_test_samples(self, num_samples: int) -> List[Tuple]: + def load_test_samples(self, num_samples: int) -> list[tuple]: """Return test samples. Args: @@ -103,9 +102,9 @@ def load_item(self, idx): audio = np.pad( audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0 ) - assert ( - audio.shape[-1] >= self.seq_len + self.pad_short - ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + assert audio.shape[-1] >= self.seq_len + self.pad_short, ( + f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + ) # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 4c4f5c48df..ffb71177c5 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -18,7 +18,7 @@ class WaveRNNDataset(Dataset): def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 8d4dd725ef..81a1f30884 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,5 +1,3 @@ -from typing import Dict, Union - import torch from torch import nn from torch.nn import functional as F @@ -226,9 +224,9 @@ class GeneratorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False @@ -313,9 +311,9 @@ class DiscriminatorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss @@ -352,7 +350,7 @@ def forward(self, scores_fake, scores_real): class WaveRNNLoss(nn.Module): - def __init__(self, wave_rnn_mode: Union[str, int]): + def __init__(self, wave_rnn_mode: str | int): super().__init__() if wave_rnn_mode == "mold": self.loss_func = discretized_mix_logistic_loss @@ -363,6 +361,6 @@ def __init__(self, wave_rnn_mode: Union[str, int]): else: raise ValueError(" [!] 
Unknown mode for Wavernn.") - def forward(self, y_hat, y) -> Dict: + def forward(self, y_hat, y) -> dict: loss = self.loss_func(y_hat, y) return {"loss": loss} diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 9f1512c6d4..187e7062e2 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -74,7 +74,7 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert isinstance(dilation, (list, tuple)) + assert isinstance(dilation, list | tuple) assert len(dilation) == 4 self.factor = factor diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index b6a1850484..481d234a54 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -5,11 +5,13 @@ from coqpit import Coqpit from TTS.utils.generic_utils import to_camel +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig, BaseVocoderConfig +from TTS.vocoder.models.base_vocoder import BaseVocoder logger = logging.getLogger(__name__) -def setup_model(config: Coqpit): +def setup_model(config: BaseVocoderConfig) -> BaseVocoder: """Load models directly from configuration.""" if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") @@ -26,19 +28,20 @@ def setup_model(config: Coqpit): try: MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: - raise ValueError(f"Model {config.model} not exist!") from e + raise ValueError(f"Model {config.model} does not exist!") from e logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) -def setup_generator(c): +def setup_generator(c: BaseGANVocoderConfig): """TODO: use config object as arguments""" logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) if c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"]) + model = MyModel(out_channels=1, **c.generator_model_params) elif c.generator_model.lower() in "melgan_generator": model = MyModel( in_channels=c.audio["num_mels"], @@ -94,8 +97,8 @@ def setup_generator(c): return model -def setup_discriminator(c): - """TODO: use config objekt as arguments""" +def setup_discriminator(c: BaseGANVocoderConfig): + """TODO: use config object as arguments""" logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -104,7 +107,7 @@ def setup_discriminator(c): MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) if c.discriminator_model in "hifigan_discriminator": model = MyModel() - if c.discriminator_model in "random_window_discriminator": + elif c.discriminator_model in "random_window_discriminator": model = MyModel( cond_channels=c.audio["num_mels"], hop_length=c.audio["hop_length"], @@ -113,7 +116,7 @@ def setup_discriminator(c): cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], window_sizes=c.discriminator_model_params["window_sizes"], ) - if c.discriminator_model in "melgan_multiscale_discriminator": + elif c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -122,7 +125,7 @@ def setup_discriminator(c): max_channels=c.discriminator_model_params["max_channels"], downsample_factors=c.discriminator_model_params["downsample_factors"], ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + elif c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -137,7 +140,7 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + elif c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -149,6 +152,8 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) - if c.discriminator_model == "univnet_discriminator": + elif c.discriminator_model == "univnet_discriminator": model = MyModel() + else: + raise NotImplementedError(f"Model {c.discriminator_model} not implemented!") return model diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..ba3852e795 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -1,5 +1,4 @@ from inspect import signature -from typing import Dict, List, Tuple 
import numpy as np import torch @@ -65,7 +64,7 @@ def inference(self, x: torch.Tensor) -> torch.Tensor: """ return self.model_g.inference(x) - def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict, optimizer_idx: int) -> tuple[dict, dict]: """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for network on the current pass. @@ -185,7 +184,7 @@ def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[ outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tuple[dict, dict]: """Logging shared by the training and evaluation. Args: @@ -205,22 +204,32 @@ def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tup return figures, audios def train_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) @@ -259,7 +268,7 @@ def on_train_step_start(self, trainer) -> None: """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. @@ -275,7 +284,7 @@ def get_optimizer(self) -> List: ) return [optimizer2, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -283,7 +292,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -297,7 +306,7 @@ def get_scheduler(self, optimizer) -> List: return [scheduler2, scheduler1] @staticmethod - def format_batch(batch: List) -> Dict: + def format_batch(batch: list) -> dict: """Format the batch for training. 
Args: @@ -316,9 +325,9 @@ def format_batch(batch: List) -> Dict: def get_data_loader( # pylint: disable=no-self-use, unused-argument self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, rank: int = None, # pylint: disable=unused-argument diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8273d02037..308b12ab56 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -179,6 +179,7 @@ def __init__( conv_post_weight_norm=True, conv_post_bias=True, cond_in_each_up_layer=False, + pre_linear=None, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -198,6 +199,7 @@ def __init__( for each consecutive upsampling layer. upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. + pre_linear (int): If not None, add nn.Linear(pre_linear, in_channels) before the convolutions. """ super().__init__() self.inference_padding = inference_padding @@ -206,6 +208,8 @@ def __init__( self.cond_in_each_up_layer = cond_in_each_up_layer # initial upsampling layers + if pre_linear is not None: + self.lin_pre = nn.Linear(pre_linear, in_channels) self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers @@ -258,6 +262,9 @@ def forward(self, x, g=None): x: [B, C, T] Tensor: [B, 1, T] """ + if hasattr(self, "lin_pre"): + x = self.lin_pre(x) + x = x.permute(0, 2, 1) o = self.conv_pre(x) if hasattr(self, "cond_layer"): o = o + self.cond_layer(g) @@ -280,7 +287,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: @@ -306,9 +313,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - 
@torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 211d45d91c..02ad60e0ff 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -71,7 +71,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -174,7 +174,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index e60c8781f0..71b38d4c0d 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -108,9 +108,9 @@ def forward(self, c): # perform upsampling if c is not None and self.upsample_net is not None: c = self.upsample_net(c) - assert ( - c.shape[-1] == x.shape[-1] - ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + assert c.shape[-1] == x.shape[-1], ( + f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + ) # encode to hidden representation x = self.first_conv(x) @@ -127,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") @@ -145,7 +145,7 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) @@ -155,9 +155,7 @@ def _apply_weight_norm(m): def receptive_field_size(self): return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 5d1f817927..d991941441 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,5 +1,4 @@ import logging -from typing import List import numpy as np import torch @@ -21,7 +20,7 @@ def __init__( out_channels: int, hidden_channels: int, cond_channels: int, - upsample_factors: List[int], + upsample_factors: list[int], lvc_layers_each_block: int, lvc_kernel_size: int, kpnet_hidden_channels: int, @@ -128,7 +127,7 @@ def apply_weight_norm(self): """Apply weight normalization module from all of the layers.""" def _apply_weight_norm(m): - if 
isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) @@ -139,7 +138,7 @@ def receptive_field_size(self): """Return receptive field size.""" return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..b1a4a26562 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -25,10 +24,10 @@ class WavegradArgs(Coqpit): use_weight_norm: bool = False y_conv_channels: int = 32 x_conv_channels: int = 768 - dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) - ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) - upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) - upsample_dilations: List[List[int]] = field( + dblock_out_channels: list[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: list[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: list[list[int]] = field( default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] ) @@ -123,7 +122,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -218,9 +217,7 @@ def apply_weight_norm(self): self.out_conv = weight_norm(self.out_conv) self.y_conv = weight_norm(self.y_conv) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -242,7 +239,7 @@ def load_checkpoint( ) self.compute_noise_level(betas) - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: # format data x = batch["input"] y = batch["waveform"] @@ -258,20 +255,30 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: pass - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, 
steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: pass - def test(self, assets: Dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument + def test(self, assets: dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument # setup noise schedule and inference ap = assets["audio_processor"] noise_schedule = self.config["test_noise_schedule"] @@ -302,13 +309,13 @@ def get_criterion(): return torch.nn.L1Loss() @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: # return a whole audio segment m, y = batch[0], batch[1] y = y.unsqueeze(1) return {"input": m, "waveform": y} - def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: List, verbose: bool, num_gpus: int): + def get_data_loader(self, config: Coqpit, assets: dict, is_eval: True, samples: list, verbose: bool, num_gpus: int): ap = assets["audio_processor"] dataset = WaveGradDataset( ap=ap, diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 1847679890..5a93f125ba 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,7 +1,6 @@ import sys import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -171,7 +170,7 @@ class WavernnArgs(Coqpit): num_res_blocks: int = 10 use_aux_net: bool = True use_upsample_net: bool = True - upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 8, 8]) mode: str = "mold" # mold [string], gauss [string], bits [int] mulaw: bool = True # apply mulaw if mode is bits pad: int = 2 @@ -226,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: - assert ( - np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length - ), " [!] upsample scales needs to be equal to hop_length" + assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, ( + " [!] 
upsample scales needs to be equal to hop_length" + ) self.upsample = UpsampleNetwork( self.args.feat_dims, self.args.upsample_factors, @@ -307,7 +306,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) @@ -528,16 +527,14 @@ def xfade_and_unfold(y, target, overlap): return unfolded - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: mels = batch["input"] waveform = batch["waveform"] waveform_coarse = batch["waveform_coarse"] @@ -552,13 +549,16 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: loss_dict = criterion(y_hat, waveform_coarse) return {"model_output": y_hat}, loss_dict - def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return self.train_step(batch, criterion) @torch.no_grad() def test( - self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument - ) -> Tuple[Dict, Dict]: + self, + assets: dict, + test_loader: "DataLoader", + output: dict, # pylint: disable=unused-argument + ) -> tuple[dict, dict]: ap = self.ap figures = {} audios = {} @@ -579,14 +579,18 @@ def test( return figures, audios def test_log( - self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: waveform = batch[0] mels = batch[1] waveform_coarse = batch[2] @@ -595,9 +599,9 @@ def format_batch(batch: Dict) -> Dict: def get_data_loader( # pylint: disable=no-self-use self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, ): diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index fe706ba9ff..bef68e5564 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -12,7 +12,7 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp(-2.0 * log_std)) return log_probs.squeeze().mean() diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index ac797d97f7..2823d206a0 100644 --- 
a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,5 +1,4 @@ import logging -from typing import Dict import numpy as np import torch @@ -32,7 +31,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: +def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> dict: """Plot the predicted and the real waveform and their spectrograms. Args: diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 042f9f8e7a..ef98fe302e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -7,11 +7,11 @@ You can use premade images built automatically from the latest TTS version. ### CPU version ```bash -docker pull ghcr.io/coqui-ai/tts-cpu +docker pull ghcr.io/idiap/coqui-tts-cpu ``` ### GPU version ```bash -docker pull ghcr.io/coqui-ai/tts +docker pull ghcr.io/idiap/coqui-tts ``` ## Building your own image @@ -25,14 +25,14 @@ You can pass any tts argument after the image name. ### CPU version ```bash -docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav +docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav ``` ### GPU version For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -41,14 +41,14 @@ Start the container and get a shell inside it. ### CPU version ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version ```bash -docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts +docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/idiap/coqui-tts python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` diff --git a/docs/source/extension/implementing_a_new_model.md b/docs/source/extension/implementing_a_new_model.md index 2521789771..188f466c72 100644 --- a/docs/source/extension/implementing_a_new_model.md +++ b/docs/source/extension/implementing_a_new_model.md @@ -37,7 +37,7 @@ an infinite flexibility to add custom behaviours for your model and training routines. For more details, see [BaseTTS](../main_classes/model_api.md#base-tts-model) - and `TTS.utils.callbacks`. + and [`trainer.callbacks`](https://github.com/idiap/coqui-ai-Trainer/blob/main/trainer/callbacks.py). 6. Optionally, define `MyModelArgs`. 
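To make the callback hook referenced above concrete, here is a minimal sketch, under the assumption that hooks are plain methods on the model which the Trainer calls at fixed points of the training loop; the hook signature and attributes mirror `on_train_step_start` from the GAN vocoder change earlier in this diff, and `MyVocoder` is purely an illustrative placeholder.

```python
# Minimal sketch (assumptions: MyVocoder is an illustrative name; the hook
# signature mirrors GAN.on_train_step_start as changed elsewhere in this diff).
from TTS.vocoder.models.base_vocoder import BaseVocoder


class MyVocoder(BaseVocoder):
    def on_train_step_start(self, trainer) -> None:
        # The Trainer passes itself in, so per-step state can be derived from
        # trainer.total_steps_done, e.g. enabling the discriminator only after
        # the configured number of steps.
        self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator
```

Other hook points (epoch and step boundaries) are listed in the `trainer.callbacks` module linked above, so training-loop tweaks can also live outside the model class.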
diff --git a/docs/source/inference.md b/docs/source/inference.md index cb7d01fca3..1bb844aee3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -16,6 +16,7 @@ Coqui TTS provides three main methods for inference: ```{toctree} :hidden: +vc server marytts ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 91d4b4078c..5f6c6ba44c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -182,7 +182,7 @@ To use the model API, you need to download the model files and pass config and m If you want to be able to `load_checkpoint` with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first. ```console -pip install deepspeed==0.10.3 +pip install deepspeed ``` #### Inference parameters diff --git a/docs/source/server.md b/docs/source/server.md index 3fa211d0d7..69bdace27b 100644 --- a/docs/source/server.md +++ b/docs/source/server.md @@ -4,8 +4,7 @@ You can boot up a demo 🐸TTS server to run an inference with your models (make sure to install the additional dependencies with `pip install coqui-tts[server]`). -Note that the server is not optimized for performance and does not support all -Coqui models yet. +Note that the server is not optimized for performance. The demo server provides pretty much the same interface as the CLI command. @@ -15,7 +14,8 @@ tts-server --list_models # list the available models. ``` Run a TTS model, from the release models list, with its default vocoder. -If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize +If the model you choose is a multi-speaker or multilingual TTS model, you can +select different speakers and languages on the Web interface and synthesize speech. ```bash diff --git a/docs/source/vc.md b/docs/source/vc.md new file mode 100644 index 0000000000..8b45d9393a --- /dev/null +++ b/docs/source/vc.md @@ -0,0 +1,84 @@ +# Voice conversion + +## Overview + +Voice conversion (VC) converts the voice in a speech signal from one speaker to +that of another speaker while preserving the linguistic content. Coqui supports +both voice conversion on its own, as well as applying it after speech synthesis +to enable multi-speaker output with single-speaker TTS models. + +### Python API + +Converting the voice in `source_wav` to the voice of `target_wav` (the latter +can also be a list of files): + +```python +from TTS.api import TTS + +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) +``` + +Voice cloning by combining TTS and VC. The FreeVC model is used for voice +conversion after synthesizing speech. + +```python + +tts = TTS("tts_models/de/thorsten/tacotron2-DDC") +tts.tts_with_vc_to_file( + "Wie sage ich auf Italienisch, dass ich dich liebe?", + speaker_wav=["target1.wav", "target2.wav"], + file_path="output.wav" +) +``` + +Some models, including [XTTS](models/xtts.md), support voice cloning directly +and a separate voice conversion step is not necessary: + +```python +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) +``` + +### CLI + +```sh +tts --out_path output/path/speech.wav \ + --model_name "<language>/<dataset>/<model_name>" \ + --source_wav <path/to/source.wav> \ + --target_wav <path/to/target.wav> +``` + +## Pretrained models + +Coqui includes the following pretrained voice conversion models.
Training is not +supported. + +### FreeVC + +- `voice_conversion_models/multilingual/vctk/freevc24` + +Adapted from: https://github.com/OlaWod/FreeVC + +### kNN-VC + +- `voice_conversion_models/multilingual/multi-dataset/knnvc` + +At least 1-5 minutes of target speaker data are recommended. + +Adapted from: https://github.com/bshall/knn-vc + +### OpenVoice + +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +Adapted from: https://github.com/myshell-ai/OpenVoice diff --git a/hubconf.py b/hubconf.py index 6e10928265..b49c9d6bcc 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,14 @@ -dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] +dependencies = [ + "torch", + "gdown", + "pysbd", + "gruut", + "anyascii", + "pypinyin", + "coqpit-config", + "mecab-python3", + "unidic-lite", +] import torch from TTS.utils.manage import ModelManager @@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us if __name__ == "__main__": - synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") + synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 4855886efd..44bf25c071 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -43,7 +43,7 @@ def process_meta_data(path): meta_data = {} # load meta data - with open(path, "r", encoding="utf-8") as f: + with open(path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") for row in data: frames = int(row[2]) @@ -58,7 +58,7 @@ def process_meta_data(path): "utt": utt, "frames": frames, "audio_len": audio_len, - "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]), + "row": f"{row[0]}|{row[1]}|{row[2]}|{row[3]}", } ) @@ -156,7 +156,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): phonemes = {} - with open(train_path, "r", encoding="utf-8") as f: + with open(train_path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") phonemes["None"] = 0 for row in data: diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/plot_embeddings_umap.ipynb similarity index 56% rename from notebooks/PlotUmapLibriTTS.ipynb rename to notebooks/plot_embeddings_umap.ipynb index 1e29790b9e..b661f85673 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/plot_embeddings_umap.ipynb @@ -4,13 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Overview\n", + "# Overview\n", "\n", "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", "\n", "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -19,63 +26,47 @@ "source": [ "import os\n", "import glob\n", + "import random\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import torch\n", "import umap\n", "\n", - "from TTS.utils.audio import AudioProcessor\n", + "from TTS.bin.compute_embeddings import compute_embeddings\n", "from TTS.config import load_config\n", + "from TTS.config.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", + "from TTS.utils.audio import AudioProcessor\n", "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.transform import factor_cmap\n", - "from bokeh.palettes import Category10" + "from bokeh.palettes import Category10\n", + "\n", + "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "For larger sets of speakers, you can use `Category20`, but you need to change it in the `pal` variable too\n", "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "List of Bokeh palettes here: https://docs.bokeh.org/en/latest/docs/reference/palettes.html\n", "\n", "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "## Config\n", "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + "You should adjust all the paths to point at the relevant locations for you locally." 
] }, { @@ -84,7 +75,16 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -1 $MODEL_RUN_PATH" + "# Dataset\n", + "formatter_name = \"ljspeech\"\n", + "dataset_name = \"ljspeech\"\n", + "dataset_path = \"path/to/LJSpeech-1.1\"\n", + "meta_file_train = \"metadata.csv\"\n", + "\n", + "# Speaker encoder\n", + "se_model_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\"\n", + "se_config_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\"\n", + "embedding_path = \"speakers.pth\"" ] }, { @@ -93,15 +93,25 @@ "metadata": {}, "outputs": [], "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" + "dataset_config = BaseDatasetConfig()\n", + "dataset_config.formatter = formatter_name\n", + "dataset_config.dataset_name = dataset_name\n", + "dataset_config.path = dataset_path\n", + "dataset_config.meta_file_train = meta_file_train\n", + "\n", + "meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=False)\n", + "utt_to_wav = {\n", + " item[\"audio_unique_name\"]: str(Path(item[\"audio_file\"]).relative_to(dataset_path)) for item in meta_data_train\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" + "## Compute embeddings\n", + "\n", + "You can skip this if you have already computed embeddings with `TTS/bin/compute_embeddings.py`" ] }, { @@ -110,33 +120,38 @@ "metadata": {}, "outputs": [], "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" + "compute_embeddings(\n", + " model_path=se_model_path,\n", + " config_path=se_config_path,\n", + " output_path=embedding_path,\n", + " formatter_name=formatter_name,\n", + " dataset_name=dataset_name,\n", + " dataset_path=dataset_path,\n", + " meta_file_train=meta_file_train,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check that we did indeed find an embedding" + "## Plot Umap" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "embed_files[0]" + "Bring in the embeddings created by `TTS/bin/compute_embeddings.py`" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + "embeddings = torch.load(embedding_path, weights_only=True)" ] }, { @@ -145,15 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" + "speakers = set()\n", + "speaker_to_utter = defaultdict(list)\n", + "for idx, embedding in embeddings.items():\n", + " speaker = embedding[\"name\"]\n", + " speakers.add(speaker)\n", + " speaker_to_utter[speaker].append(idx)\n", + "print(f\"Speaker count: {len(speakers)}\")" ] }, 
{ @@ -175,35 +188,32 @@ "labels = []\n", "locations = []\n", "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", + "# single speaker\n", + "num_speakers = 1\n", + "num_utters = 1000\n", "\n", "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", + "# num_speakers = 10\n", + "# num_utters = 20\n", "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "speaker_idxs = random.sample(list(speakers), num_speakers)\n", "\n", "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " speakers_utter = speaker_to_utter[speaker_idx]\n", + " utter_idxs = random.sample(speakers_utter, num_utters)\n", " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" + " embed = np.array(embeddings[utter_idx][\"embedding\"])\n", + " embeds.append(embed)\n", + " labels.append(speaker_idx)\n", + " locations.append(utt_to_wav[utter_idx])\n", + "embeds = np.stack(embeds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load embeddings with UMAP" + "### Load embeddings with UMAP" ] }, { @@ -222,9 +232,7 @@ "source": [ "### Interactively charting the data in Bokeh\n", "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "You can use the regular Bokeh [tools](https://docs.bokeh.org/en/latest/docs/user_guide/interaction/tools.html) to explore the data, with reset setting it back to normal\n", "\n", "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", "\n", @@ -238,22 +246,17 @@ "outputs": [], "source": [ "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", + " data=dict(\n", + " x=projection.T[0].tolist(),\n", + " y=projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels,\n", " )\n", + ")\n", "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", + "hover = HoverTool(tooltips=[(\"file\", \"@desc\"), (\"speaker\", \"@label\")])\n", "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", + "### Optionally consider adding these to the tooltips if you want additional detail\n", "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", "\n", @@ -261,10 +264,13 @@ "pal_size = max(len(factors), 3)\n", "pal = Category10[pal_size]\n", "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "p = figure(width=600, 
height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n", + "p.scatter(\n", + " \"x\",\n", + " \"y\",\n", + " source=source_wav_stems,\n", + " color=factor_cmap(\"label\", palette=pal, factors=factors),\n", + ")\n", "\n", "url = \"http://localhost:8000/@desc\"\n", "taptool = p.select(type=TapTool)\n", @@ -292,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd $AUDIO_PATH\n", + "%cd $dataset_path\n", "%pwd\n", "!python -m http.server" ] @@ -300,7 +306,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -314,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index a7baf29e31..e9516d3d8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,10 @@ build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.25.1" +version = "0.25.3" description = "Deep learning for Text to Speech." readme = "README.md" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.13" license = {text = "MPL-2.0"} authors = [ {name = "Eren Gölge", email = "egolge@coqui.ai"} @@ -39,7 +39,6 @@ maintainers = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -70,6 +69,7 @@ dependencies = [ "pyyaml>=6.0", "fsspec[http]>=2023.6.0", "packaging>=23.1", + "typing_extensions>=4.10", # Inference "pysbd>=0.3.4", # Training @@ -87,13 +87,13 @@ dependencies = [ "encodec>=0.1.1", # XTTS "num2words>=0.5.14", - "spacy[ja]>=3,<3.8", + "spacy[ja]>=3.2,<3.8", ] [project.optional-dependencies] # Only used in notebooks notebooks = [ - "bokeh==1.4.0", + "bokeh>=3.0.3", "pandas>=1.4,<2.0", "umap-learn>=0.5.1", ] @@ -115,7 +115,7 @@ ko = [ ] # Japanese ja = [ - "mecab-python3>=1.0.2", + "mecab-python3>=1.0.6", "unidic-lite==1.0.8", "cutlet>=0.2.0", ] @@ -135,11 +135,10 @@ all = [ [dependency-groups] dev = [ - "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", - "pre-commit>=3", - "ruff==0.7.0", + "pre-commit>=4", + "pytest>=8", + "ruff==0.9.1", ] # Dependencies for building the documentation docs = [ @@ -173,7 +172,6 @@ exclude = [ "/.readthedocs.yml", "/Makefile", "/dockerfiles", - "/run_bash_tests.sh", "/scripts", "/tests", ] @@ -192,6 +190,7 @@ lint.extend-select = [ "F704", # yield-outside-function "F706", # return-outside-function "F841", # unused-variable + "G004", # no f-string in logging "I", # import sorting "PIE790", # unnecessary-pass "PLC", @@ -201,6 +200,7 @@ lint.extend-select = [ "PLR0911", # too-many-return-statements "PLR1711", # useless-return "PLW", + "UP", # pyupgrade "W291", # trailing-whitespace "NPY201", # NumPy 2.0 deprecation ] @@ -231,14 +231,10 @@ max-returns = 7 "E402", # module level import not at top of file ] -[tool.black] -line-length = 120 -target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/run_bash_tests.sh b/run_bash_tests.sh deleted file mode 100755 index 2f5ba88934..0000000000 --- a/run_bash_tests.sh +++ /dev/null @@ -1,7 +0,0 @@ -set -e -TF_CPP_MIN_LOG_LEVEL=3 - -# runtime bash based tests -# TODO: move these to 
python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh diff --git a/tests/__init__.py b/tests/__init__.py index f0a8b2f118..1a03d07552 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,8 @@ import os +from collections.abc import Callable +from typing import Optional +import pytest from trainer.generic_utils import get_cuda from TTS.config import BaseDatasetConfig @@ -44,6 +47,12 @@ def run_cli(command): assert exit_status == 0, f" [!] command `{command}` failed." +def run_main(main_func: Callable, args: list[str] | None = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code + + def get_test_data_config(): return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. 
- self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_server.py b/tests/aux_tests/test_server.py new file mode 100644 index 0000000000..1b691f9596 --- /dev/null +++ b/tests/aux_tests/test_server.py @@ -0,0 +1,47 @@ +import os +import signal +import socket +import subprocess +import time +import wave + +import pytest +import requests + +PORT = 5003 + + +def wait_for_server(host, port, timeout=30): + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.create_connection((host, port), timeout=2): + return True + except (OSError, ConnectionRefusedError): + time.sleep(1) + raise TimeoutError(f"Server at {host}:{port} did not start within {timeout} seconds.") + + +@pytest.fixture(scope="module", autouse=True) +def start_flask_server(): + server_process = subprocess.Popen( + ["python", "-m", "TTS.server.server", "--port", str(PORT)], + ) + wait_for_server("localhost", PORT) + yield + os.kill(server_process.pid, signal.SIGTERM) + server_process.wait() + + +def test_flask_server(tmp_path): + url = f"http://localhost:{PORT}/api/tts?text=synthesis%20schmynthesis" + response = requests.get(url) + assert response.status_code == 200, f"Request failed with status code 
{response.status_code}" + + wav_path = tmp_path / "output.wav" + with wav_path.open("wb") as f: + f.write(response.content) + + with wave.open(str(wav_path), "rb") as wav_file: + num_frames = wav_file.getnframes() + assert num_frames > 0, "WAV file contains no frames." diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 5d8626faa6..0e15db2ab0 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -1,88 +1,86 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), ) - run_cli(command) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + print(config) + # train the model for one epoch + run_test_train() -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest 
folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() + # train the model for one epoch + run_test_train() -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/bash_tests/test_demo_server.sh b/tests/bash_tests/test_demo_server.sh deleted file mode 100755 index ebd0bc8b89..0000000000 --- a/tests/bash_tests/test_demo_server.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -xe - -python -m TTS.server.server & -SERVER_PID=$! - -echo 'Waiting for server...' 
-sleep 30 - -curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" -python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav - -kill $SERVER_PID - -rm /tmp/audio.wav diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..975281c549 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,210 +44,210 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False -print(" > Dynamic data loader test: {}".format(DATA_EXIST)) - - -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * 
c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +print(f" > Dynamic data loader test: {DATA_EXIST}") + + +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == 
batch_size + assert linear_input.shape[2] == ap.fft_size // 2 + 1 + assert mel_input.shape[2] == c.audio["num_mels"] + assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length + assert isinstance(speaker_name[0], str) + + # make sure that the computed mels and the waveform match and correctly computed + mel_new = ap.melspectrogram(wavs[0].squeeze().numpy()) + # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding + mel_new = mel_new[:, : mel_lengths[0]] + ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) + mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] + assert abs(mel_diff.sum()) < 1e-5 + + # check normalization ranges + if ap.symmetric_norm: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= -ap.max_norm + assert mel_input.min() < 0 + else: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= 0 + + +def test_batch_group_shuffle(): + dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + assert avg_length >= last_length + assert is_items_reordered + + +def test_start_by_longest(): + """Test start_by_longest option. + + The first item of the first batch must be longer than all the other items. + """ + dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + assert all(max_len >= mel_lengths) + + +def test_padding_and_spectrograms(tmp_path): + def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): + assert linear_input[idx, -1].sum() != 0 # check padding + assert linear_input[idx, -2].sum() != 0 + assert mel_input[idx, -1].sum() != 0 + assert mel_input[idx, -2].sum() != 0 + assert stop_target[idx, -1] == 1 + assert stop_target[idx, -2] == 0 + assert stop_target[idx].sum() == 1 + assert len(mel_lengths.shape) == 1 + assert mel_lengths[idx] == linear_input[idx].shape[0] + assert mel_lengths[idx] == mel_input[idx].shape[0] + + dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32) + mel = ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail.
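+ # exact equality fails for an unknown reason (see NOTE above), so a small tolerance is used below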
+ assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." 
--out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index f9067530e6..370a541b97 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -240,12 +240,8 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" - ) - self._EXPECTED = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" - ) + self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" + self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 9be1f0bf41..f5d342bb00 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -24,6 +24,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + assert phoneme_cleaners("1" + "0" * 35) == "one hundred decillion" + assert phoneme_cleaners("1" + "0" * 36) == "one" + " zero" * 36 def test_multilingual_phoneme_cleaners() -> None: @@ -43,11 +45,11 @@ def test_normalize_unicode() -> None: ("na\u0303", "nã"), ("o\u0302u", "ôu"), ("n\u0303", "ñ"), - ("\u4E2D\u56FD", "中国"), + ("\u4e2d\u56fd", "中国"), ("niño", "niño"), ("a\u0308", "ä"), ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"), - ("\u03B1\u03B2", "αβ"), + ("\u03b1\u03b2", "αβ"), ] for arg, expect in test_cases: assert normalize_unicode(arg) == expect diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py index 794478dca3..2290e9a6cc 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -21,7 +21,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 
1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -29,14 +29,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -52,7 +52,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -60,14 +60,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class MSELossMaskedTests(unittest.TestCase): @@ -85,7 +85,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -93,14 +93,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -116,7 +116,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 
8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -124,14 +124,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class SSIMLossTests(unittest.TestCase): @@ -153,7 +153,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.ones(4) * 58).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() >= 1.0, "0 vs {}".format(output.item()) + assert output.item() >= 1.0, f"0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -168,7 +168,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -184,7 +184,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 57, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -192,14 +192,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 57, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class BCELossTest(unittest.TestCase): diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py index 4789d53d9e..f4b8d5cadd 100644 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import 
get_device_id, run_cli from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = NeuralhmmTTSConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - 
os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py index d86bde6854..e2dec3c899 100644 --- a/tests/tts_tests/test_overflow_train.py +++ b/tests/tts_tests/test_overflow_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.overflow_config import OverflowConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = OverflowConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + 
f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 2aac7f101d..30efe38d9f 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_speedy_speech_config.json" + output_path = tmp_path / "train_outputs" -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = SpeedySpeechConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder 
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index d2d1d5c35f..12791feda4 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,79 +1,81 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = 
tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=False, + use_d_vector_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with open(continue_config_path, encoding="utf-8") as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 72b6bcd46b..72069bf943 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,8 +72,8 @@ def test_train_step(self): # pylint: disable=no-self-use for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -131,8 +131,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -198,8 +198,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -254,8 +254,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -321,8 +321,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index 83a07d1a6c..2696edb1b6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,77 +1,79 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + num_speakers=4, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index df0e934d8e..f8667b6d02 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,72 +1,72 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = 
max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 43e72417c2..9521cfea26 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -67,8 +67,8 @@ def test_in_out(): output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None) assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) + assert output.shape[1] == 80, f"size not {output.shape[1]}" + assert output.shape[2] == 2, f"size not {output.shape[2]}" assert stop_tokens.shape[0] == 4 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 7ec3f0df1b..5f9af86e7e 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -51,7 +51,7 @@ def test_train_step(): criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -71,8 +71,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -105,7 +105,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -127,8 +127,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -165,7 +165,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -186,8 +186,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -217,7 +217,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -238,8 +238,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -288,7 +288,7 @@ def test_train_step(): criterion = model.get_criterion() optimizer = model.get_optimizer() model.train() - print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron with Capacitron VAE model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -305,8 +305,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -341,7 +341,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 17f1fd46a6..cc91b18c34 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -1,64 +1,63 @@ -import glob -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron_config import TacotronConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS 
API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = TacotronConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + r=5, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index c8a52e1c1b..790439ecb2 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -373,8 +373,8 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - name, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count = count + 1 diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 741bda91e9..b95e1deed3 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -1,61 +1,61 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + 
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 09df7d29f2..189e6cfb4d 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,110 +1,111 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = 
os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + language_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert 
config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {language_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 7ae09c0e5c..8b8757422c 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,117 +1,117 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python 
TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + config.d_vector_dim = 256 + + # duration predictor + config.model_args.use_sdp = True + config.use_sdp = True + + # activate language and speaker samplers + config.use_language_weighted_sampler = True + config.language_weighted_sampler_alpha = 10 + config.use_speaker_weighted_sampler = True + 
config.speaker_weighted_sampler_alpha = 5 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + language_id = "en" + continue_speakers_path = config.d_vector_file + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {language_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 69fae21f8d..6678cca90c 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,83 +1,83 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-1"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-1"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 -# active multispeaker d-vec mode 
-config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) -config.save_json(config_path) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 78f42d154b..e0f7a656b0 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = 
max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py index 91c3c35bc6..1582f51fd4 100644 --- a/tests/tts_tests2/test_align_tts_train.py +++ b/tests/tts_tests2/test_align_tts_train.py @@ -1,72 +1,71 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.align_tts_config import AlignTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) + config = AlignTTSConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + 
test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index 1e5cd49f73..74d7a0a734 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -1,100 +1,98 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) 
-speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs( + use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 + ) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + output_path=output_path, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + speaker_embedding_channels=256, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = False + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert 
config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index 9bbf7a55ea..68f790599e 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -1,94 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs(use_speaker_embedding=False) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], + output_path=output_path, + num_speakers=4, + use_speaker_embedding=True, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.dataset_name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index 3e6fbd2e86..4676ee4869 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -1,97 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -vocoder_config = VocoderConfig() + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs() + vocoder_config = VocoderConfig() -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) + config = DelightfulTTSConfig( + audio=audio_config, + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + run_eval=True, + test_delay_epochs=-1, + binary_align_loss_alpha=0.0, + epochs=1, + print_step=1, + use_attn_priors=False, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + use_speaker_embedding=False, + ) + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - 
"--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs -1" + ) -run_cli(command_train) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == -1 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py index e6bc9f9feb..379e2f346b 100644 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py @@ -1,92 +1,94 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + 
f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py index fe87c8b600..e0838a2049 100644 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ b/tests/tts_tests2/test_fast_pitch_train.py @@ -1,91 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + 
config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py index 735d2fc4c6..348729c6f4 100644 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py @@ -1,95 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=tmp_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + 
epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py index 07fc5a1a2c..ab513ec827 100644 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ b/tests/tts_tests2/test_fastspeech_2_train.py @@ -1,94 +1,96 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=output_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + 
print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 3c7ac51556..c92063576f 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -42,8 +42,8 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -107,7 +107,7 @@ def _test_forward(self, batch_size): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -134,7 +134,7 @@ def _test_forward_with_d_vector(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -160,7 +160,7 @@ def _test_forward_with_speaker_id(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size): # inference encoder and decoder with MAS y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) y2 = model.decoder_inference(mel_spec, mel_lengths) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape + assert y2["model_outputs"].shape == y["model_outputs"].shape, ( + "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) ) def test_inference_with_MAS(self): @@ -261,7 +261,7 @@ def test_train_step(self): # reference model to compare model weights model_ref = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) count = 0 diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py index 8236607c25..f03139ac77 100644 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_glow_tts_d-vectors_train.py @@ -1,79 +1,80 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + 
output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with 
continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py index 4a8bd0658d..b9fe93a2fa 100644 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py @@ -1,76 +1,77 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=True, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - 
"--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py index 1d7f913575..3f1bf3a794 100644 --- a/tests/tts_tests2/test_glow_tts_train.py +++ b/tests/tts_tests2/test_glow_tts_train.py @@ -1,73 +1,74 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index fe07b2723c..784e32a68d 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -55,7 +55,7 @@ def _test_forward(self, batch_size): config = FreeVCConfig() model = FreeVC(config).to(device) model.train() - print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) + print(f" > Num parameters for FreeVC model:{count_parameters(model)}") mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) @@ -80,9 +80,9 @@ def _test_inference(self, batch_size): wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths) - assert ( - output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1] - ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], ( + f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + ) def test_inference(self): self._test_inference(1) @@ -95,9 +95,9 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == 
source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + ) def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py index c9f7ae3931..703873ea47 100644 --- a/tests/vc_tests/test_openvoice.py +++ b/tests/vc_tests/test_openvoice.py @@ -16,7 +16,6 @@ class TestOpenVoice(unittest.TestCase): - @staticmethod def _create_inputs_inference(): source_wav = torch.rand(16100) @@ -37,6 +36,6 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}" + ) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - 
print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py 
--config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_training.py b/tests/vocoder_tests/test_training.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/vocoder_tests/test_training.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, 
"*/")), key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ 
b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad 
* 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..d1d3610b70 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,43 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any(), f"param {i} with shape {param.shape} not updated!! \n{param}\n{param_ref}" diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 
--- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index bb592f1f2d..4d22b8102f 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -1,10 +1,9 @@ -import os -import shutil +from pathlib import Path +import pytest import torch from trainer import Trainer, TrainerArgs -from tests import get_tests_output_path from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.layers.xtts.dvae import DiscreteVAE @@ -28,37 +27,9 @@ DASHBOARD_LOGGER = "tensorboard" LOGGER_URI = None -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - +XTTS_CHECKPOINT = None # model.pth file # Training sentences generations SPEAKER_REFERENCE = [ @@ -66,99 +37,122 @@ ] # speaker reference to be used in training test sentences LANGUAGE = config_dataset.language - # Training Parameters OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False START_WITH_EVAL = False # if True it will star with evaluation BATCH_SIZE = 2 # set here the batch size GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. 
-# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() -# remove output path -shutil.rmtree(OUT_PATH) +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms on test time + # DVAE parameters: For the training we need the dvae to extract the dvae tokens, + # given that you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # it was adjusted accordly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 454e867385..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig -from TTS.tts.models.xtts import XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
-TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. 
It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too 
big for CI")
+def test_xtts_v2(tmp_path):
+    """XTTS is too big to run on github actions. We need to test it locally"""
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/xtts_v2",
+        "--text",
+        "C'est un exemple.",
+        "--language_idx",
+        "fr",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+        "--speaker_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"),
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"),
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_xtts_v2_streaming(manager):
+    """Testing the new inference_stream method"""
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import Xtts
+
+    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
+    model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2")
+    config = XttsConfig()
+    config.load_json(model_path / "config.json")
+    model = Xtts.init_from_config(config)
+    model.load_checkpoint(config, checkpoint_dir=str(model_path))
+    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+    print("Computing speaker latents...")
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+
+    print("Inference...")
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        if i == 0:
+            assert chunk.shape[-1] > 5000
+        wav_chunks.append(chunk)
+    assert len(wav_chunks) > 1
+    normal_len = sum([len(chunk) for chunk in wav_chunks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=1.5,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        wav_chunks.append(chunk)
+    fast_len = sum([len(chunk) for chunk in wav_chunks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=0.66,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        wav_chunks.append(chunk)
+    slow_len = sum([len(chunk) for chunk in wav_chunks])
+
+    assert slow_len > normal_len
+    assert normal_len > fast_len
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_tortoise(tmp_path):
+    args = [
+        "--model_name",
+        "tts_models/en/multi-dataset/tortoise-v2",
+        "--text",
+        "This is an example.",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_bark(tmp_path):
+    """Bark is too big to run on github actions. We need to test it locally"""
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/bark",
+        "--text",
+        "This is an example.",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py
index 461b4fbe12..9f02672ef1 100644
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -2,10 +2,11 @@
 import os
 import shutil
 
-import torch
-from trainer.io import get_user_data_dir
+import pytest
 
-from tests import get_tests_data_path, get_tests_output_path, run_cli
+from tests import get_tests_data_path, run_main
+from TTS.api import TTS
+from TTS.bin.synthesize import main
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.manage import ModelManager
@@ -18,249 +19,81 @@
 ]
 
 
-def run_models(offset=0, step=1):
-    """Check if all the models are downloadable and tts models run correctly."""
-    print(" > Run synthesizer with all the models.")
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
-    model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS]
-    print("Model names:", model_names)
-    for model_name in model_names[offset::step]:
-        print(f"\n > Run - {model_name}")
-        model_path, _, _ = manager.download_model(model_name)
-        if "tts_models" in model_name:
-            local_download_dir = model_path.parent
-            # download and run the model
-            speaker_files = list(local_download_dir.glob("speaker*"))
-            language_files = list(local_download_dir.glob("language*"))
-            speaker_arg = ""
-            language_arg = ""
-            if len(speaker_files) > 0:
-                # multi-speaker model
-                if "speaker_ids" in speaker_files[0].stem:
-                    speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
-                elif "speakers" in speaker_files[0].stem:
-                    speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
-                speakers = list(speaker_manager.name_to_id.keys())
-                if len(speakers) > 1:
-                    speaker_arg = f'--speaker_idx "{speakers[0]}"'
-            if len(language_files) > 0 and "language_ids" in language_files[0].stem:
-                # multi-lingual model
-                language_manager = LanguageManager(language_ids_file_path=language_files[0])
-                languages = language_manager.language_names
-                if len(languages) > 1:
-                    language_arg = f'--language_idx "{languages[0]}"'
-            run_cli(
-                f'tts --model_name {model_name} --text "This is an example." '
-                f'--out_path "{output_path}" {speaker_arg} {language_arg} --no-progress_bar'
-            )
-            # remove downloaded models
-            shutil.rmtree(local_download_dir)
-            shutil.rmtree(get_user_data_dir("tts"))
-        elif "voice_conversion_models" in model_name:
-            speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-            reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
-            run_cli(
-                f"tts --model_name {model_name} "
-                f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar'
-            )
-        else:
-            # only download the model
-            manager.download_model(model_name)
-        print(f" | > OK: {model_name}")
-
-
-def test_xtts():
-    """XTTS is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
-            f'--speaker_wav "{speaker_wav}" --language_idx "en"'
-        )
+@pytest.fixture(autouse=True)
+def run_around_tests(tmp_path):
+    """Download models to a temp folder and delete it afterwards."""
+    os.environ["TTS_HOME"] = str(tmp_path)
+    yield
+    shutil.rmtree(tmp_path)
+
+
+@pytest.fixture
+def manager(tmp_path):
+    """Set up model manager."""
+    return ModelManager(output_prefix=tmp_path, progress_bar=False)
+
+
+# To split tests into different CI jobs
+num_partitions = int(os.getenv("NUM_PARTITIONS", "1"))
+partition = int(os.getenv("TEST_PARTITION", "0"))
+model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS]
+model_names.extend(["tts_models/deu/fairseq/vits", "tts_models/sqi/fairseq/vits"])
+model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition]
+
+
+@pytest.mark.parametrize("model_name", model_names)
+def test_models(tmp_path, model_name, manager):
+    print(f"\n > Run - {model_name}")
+    output_path = str(tmp_path / "output.wav")
+    model_path, _, _ = manager.download_model(model_name)
+    args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"]
+    if "tts_models" in model_name:
+        local_download_dir = model_path.parent
+        # download and run the model
+        speaker_files = list(local_download_dir.glob("speaker*"))
+        language_files = list(local_download_dir.glob("language*"))
+        speaker_arg = []
+        language_arg = []
+        if len(speaker_files) > 0:
+            # multi-speaker model
+            if "speaker_ids" in speaker_files[0].stem:
+                speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
+            elif "speakers" in speaker_files[0].stem:
+                speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
+            speakers = list(speaker_manager.name_to_id.keys())
+            if len(speakers) > 1:
+                speaker_arg = ["--speaker_idx", speakers[0]]
+        if len(language_files) > 0 and "language_ids" in language_files[0].stem:
+            # multi-lingual model
+            language_manager = LanguageManager(language_ids_file_path=language_files[0])
+            languages = language_manager.language_names
+            if len(languages) > 1:
+                language_arg = ["--language_idx", languages[0]]
+        run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg])
+    elif "voice_conversion_models" in model_name:
+        speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
+        reference_wav1 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0028.wav")
+        reference_wav2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
+        run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav1, reference_wav2])
     else:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
-            f'--speaker_wav "{speaker_wav}" --language_idx "en"'
-        )
-
-
-def test_xtts_streaming():
-    """Testing the new inference_stream method"""
-    from TTS.tts.configs.xtts_config import XttsConfig
-    from TTS.tts.models.xtts import Xtts
-
-    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
-    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
-    speaker_wav.append(speaker_wav_2)
-    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
-    model = Xtts.init_from_config(config)
-    model.load_checkpoint(config, checkpoint_dir=model_path)
-    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        # only download the model
+        manager.download_model(model_name)
+    print(f" | > OK: {model_name}")
 
-    print("Computing speaker latents...")
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
-
-    print("Inference...")
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        if i == 0:
-            assert chunk.shape[-1] > 5000
-        wav_chuncks.append(chunk)
-    assert len(wav_chuncks) > 1
-
-
-def test_xtts_v2():
-    """XTTS is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
-            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
-        )
-    else:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
-            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
-        )
-
-def test_xtts_v2_streaming():
-    """Testing the new inference_stream method"""
-    from TTS.tts.configs.xtts_config import XttsConfig
-    from TTS.tts.models.xtts import Xtts
-
-    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
-    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
-    model = Xtts.init_from_config(config)
-    model.load_checkpoint(config, checkpoint_dir=model_path)
-    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-
-    print("Computing speaker latents...")
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
-
-    print("Inference...")
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        if i == 0:
-            assert chunk.shape[-1] > 5000
-        wav_chuncks.append(chunk)
-    assert len(wav_chuncks) > 1
-    normal_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-        speed=1.5,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        wav_chuncks.append(chunk)
-    fast_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-        speed=0.66,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        wav_chuncks.append(chunk)
-    slow_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    assert slow_len > normal_len
-    assert normal_len > fast_len
-
-
-def test_tortoise():
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
-        )
-    else:
-        run_cli(
-            f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
-        )
-
-
-def test_bark():
-    """Bark is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
-        )
-    else:
-        run_cli(
-            f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
-        )
-
-
-def test_voice_conversion():
+def test_voice_conversion(tmp_path):
     print(" > Run voice conversion inference using YourTTS model.")
-    model_name = "tts_models/multilingual/multi-dataset/your_tts"
-    language_id = "en"
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    run_cli(
-        f"tts --model_name {model_name}"
-        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar"
-    )
-
-
-"""
-These are used to split tests into different actions on Github.
-"""
-
-
-def test_models_offset_0_step_3():
-    run_models(offset=0, step=3)
-
-
-def test_models_offset_1_step_3():
-    run_models(offset=1, step=3)
-
-
-def test_models_offset_2_step_3():
-    run_models(offset=2, step=3)
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/your_tts",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--speaker_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"),
+        "--reference_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"),
+        "--language_idx",
+        "en",
+        "--no-progress_bar",
+    ]
+    run_main(main, args)
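For readers of this patch, a minimal sketch of the round-robin selection driven by the NUM_PARTITIONS and TEST_PARTITION environment variables (used above by the new zoo CI job and the parametrized tests in tests/zoo_tests/test_models.py); the model names below are placeholders, not real zoo models, and the real list comes from TTS.list_models():

import os

# Placeholder model list; in the test suite this comes from TTS.list_models().
model_names = ["model_a", "model_b", "model_c", "model_d", "model_e"]

# Each CI job sets a different TEST_PARTITION and keeps only its slice.
num_partitions = int(os.getenv("NUM_PARTITIONS", "3"))
partition = int(os.getenv("TEST_PARTITION", "0"))

selected = [name for i, name in enumerate(model_names) if i % num_partitions == partition]
print(selected)  # with the defaults above: ['model_a', 'model_d']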