From 55eeaf227f56b6bf453678a6c2167293e82a352a Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 00:01:53 +0100 Subject: [PATCH 1/8] build: update dev dependencies --- .github/actions/setup-uv/action.yml | 5 +++-- .github/workflows/style_check.yml | 6 ------ .github/workflows/tests.yml | 6 ------ .pre-commit-config.yaml | 4 ++-- TTS/utils/generic_utils.py | 4 ++-- pyproject.toml | 6 +++--- 6 files changed, 10 insertions(+), 21 deletions(-) diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml index c7dd4f5f99..88a73e8481 100644 --- a/.github/actions/setup-uv/action.yml +++ b/.github/actions/setup-uv/action.yml @@ -4,8 +4,9 @@ runs: using: 'composite' steps: - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: - version: "0.5.4" + version: "0.5.17" enable-cache: true cache-dependency-glob: "**/pyproject.toml" + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index d1060f6be2..03426808cc 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -9,15 +9,9 @@ on: jobs: lint: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Lint check run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7905add3f7..58be50da40 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,8 +28,6 @@ jobs: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | @@ -72,8 +70,6 @@ 
jobs: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts"]'), matrix.subset) run: | @@ -116,8 +112,6 @@ jobs: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak run: | sudo apt-get update diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62420e9958..97542c8cc3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,12 +8,12 @@ repos: - id: end-of-file-fixer - id: trailing-whitespace - repo: "https://github.com/psf/black" - rev: 24.2.0 + rev: 24.10.0 hooks: - id: black language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 77566c3f6a..090522493b 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -80,9 +80,9 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: Dict: arguments with formatted auxilary inputs. 
""" kwargs = kwargs.copy() - for name in def_args: + for name, arg in def_args.items(): if name not in kwargs or kwargs[name] is None: - kwargs[name] = def_args[name] + kwargs[name] = arg return kwargs diff --git a/pyproject.toml b/pyproject.toml index 4b87a10b20..fa58e5b934 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,11 +136,11 @@ all = [ [dependency-groups] dev = [ - "black==24.2.0", + "black==24.10.0", "coverage[toml]>=7", - "pre-commit>=3", + "pre-commit>=4", "pytest>=8", - "ruff==0.7.0", + "ruff==0.9.1", ] # Dependencies for building the documentation docs = [ From 38e3c94317ff7cdf5bd9dc28ffab58992748d739 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 00:06:29 +0100 Subject: [PATCH 2/8] build: drop python 3.9 support --- .github/workflows/tests.yml | 8 ++++---- README.md | 2 +- pyproject.toml | 9 ++++----- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 58be50da40..aa01abb874 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,7 +22,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9, "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 @@ -48,7 +48,7 @@ jobs: - name: Unit tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} @@ -64,7 +64,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.12"] + python-version: ["3.10", "3.12"] subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts"] steps: - uses: actions/checkout@v4 @@ -90,7 +90,7 @@ jobs: - name: Integration tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ 
"${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} diff --git a/README.md b/README.md index c0843b731d..db8868b26d 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ You can also help us implement more models. ## Installation -🐸TTS is tested on Ubuntu 24.04 with **python >= 3.9, < 3.13**, but should also +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.13**, but should also work on Mac and Windows. If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. diff --git a/pyproject.toml b/pyproject.toml index fa58e5b934..95a1cead69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ name = "coqui-tts" version = "0.25.3" description = "Deep learning for Text to Speech." readme = "README.md" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.13" license = {text = "MPL-2.0"} authors = [ {name = "Eren GÃļlge", email = "egolge@coqui.ai"} @@ -39,7 +39,6 @@ maintainers = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -88,7 +87,7 @@ dependencies = [ "encodec>=0.1.1", # XTTS "num2words>=0.5.14", - "spacy[ja]>=3,<3.8", + "spacy[ja]>=3.2,<3.8", ] [project.optional-dependencies] @@ -116,7 +115,7 @@ ko = [ ] # Japanese ja = [ - "mecab-python3>=1.0.2", + "mecab-python3>=1.0.6", "unidic-lite==1.0.8", "cutlet>=0.2.0", ] @@ -233,7 +232,7 @@ max-returns = 7 [tool.black] line-length = 120 -target-version = ['py39'] +target-version = ['py310'] [tool.coverage.report] skip_covered = true From 2e4fbb671dcb2acbb0b23d5ac2359ea93cd3313a Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 
Jan 2025 00:42:42 +0100 Subject: [PATCH 3/8] refactor: enable ruff rule G004 --- TTS/encoder/utils/prepare_voxceleb.py | 12 ++++++------ TTS/tts/datasets/dataset.py | 18 +++++++++--------- TTS/tts/layers/bark/inference_funcs.py | 4 ++-- TTS/tts/layers/bark/load_model.py | 8 ++++---- TTS/vc/layers/freevc/wavlm/wavlm.py | 2 +- TTS/vc/models/knnvc.py | 2 +- pyproject.toml | 1 + 7 files changed, 24 insertions(+), 23 deletions(-) diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index 37619ed0f8..8ca7ea2fac 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -81,7 +81,7 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logger.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), shell=True, @@ -122,9 +122,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logger.info(f"Child was terminated by signal {retcode}") + logger.info("Child was terminated by signal %d", retcode) except OSError as e: - logger.info(f"Execution failed: {e}") + logger.info("Execution failed: %s", e) retcode = -999 return retcode @@ -138,10 +138,10 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. 
""" cmd = f"ffmpeg -i {aac_file} {wav_file}" - logger.info(f"Decoding aac file using command line: {cmd}") + logger.info("Decoding aac file using command line: %s", cmd) ret = exec_cmd(cmd) if ret != 0: - logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Failed to decode aac file with retcode %s", ret) logger.error("Please check your ffmpeg installation.") return False return True @@ -156,7 +156,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv """ - logger.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s", subset) source_dir = os.path.join(input_dir, subset) files = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 5f629f32a9..2b0cdf2c9b 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -231,7 +231,7 @@ def lengths(self) -> list[int]: try: audio_len = get_audio_size(wav_file) except RuntimeError: - logger.warning(f"Failed to compute length for {item['audio_file']}") + logger.warning("Failed to compute length for %s", item["audio_file"]) audio_len = 0 lens.append(audio_len) return lens @@ -352,7 +352,7 @@ def _compute_lengths(samples): try: audio_length = get_audio_size(item["audio_file"]) except RuntimeError: - logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + logger.warning("Failed to compute length, skipping %s", item["audio_file"]) continue text_lenght = len(item["text"]) item["audio_length"] = audio_length @@ -437,14 +437,14 @@ def preprocess_samples(self) -> None: self.samples = samples logger.info("Preprocessing samples") - logger.info(f"Max text length: {np.max(text_lengths)}") - logger.info(f"Min text length: {np.min(text_lengths)}") - logger.info(f"Avg text length: {np.mean(text_lengths)}") - logger.info(f"Max audio length: {np.max(audio_lengths)}") - 
logger.info(f"Min audio length: {np.min(audio_lengths)}") - logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Max text length: %d", np.max(text_lengths)) + logger.info("Min text length: %d", np.min(text_lengths)) + logger.info("Avg text length: %.2f", np.mean(text_lengths)) + logger.info("Max audio length: %.2f", np.max(audio_lengths)) + logger.info("Min audio length: %.2f", np.min(audio_lengths)) + logger.info("Avg audio length: %.2f", np.mean(audio_lengths)) logger.info("Num. instances discarded samples: %d", len(ignore_idx)) - logger.info(f"Batch group size: {self.batch_group_size}.") + logger.info("Batch group size: %d", self.batch_group_size) @staticmethod def _sort_batch(batch, text_lengths): diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 65c7800dcf..58331bc096 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -206,8 +206,8 @@ def generate_text_semantic( semantic_history = None encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET if len(encoded_text) > 256: - p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1) - logger.warning(f"warning, text too long, lopping of last {p}%") + p = (len(encoded_text) - 256) / len(encoded_text) * 100 + logger.warning("warning, text too long, lopping of last %.1f%%", p) encoded_text = encoded_text[:256] encoded_text = np.pad( encoded_text, diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index c1e0d006cb..dcec5b5bbc 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -88,7 +88,7 @@ def clear_cuda_cache(): def load_model(ckpt_path, device, config, model_type="text"): - logger.info(f"loading {model_type} model from {ckpt_path}...") + logger.info("loading %s model from %s...", model_type, ckpt_path) if device == "cpu": logger.warning("No GPU being used. 
Careful, Inference might be extremely slow!") @@ -108,10 +108,10 @@ def load_model(ckpt_path, device, config, model_type="text"): and os.path.exists(ckpt_path) and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"] ): - logger.warning(f"found outdated {model_type} model, removing...") + logger.warning("found outdated %s model, removing...", model_type) os.remove(ckpt_path) if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading...") + logger.info("%s model not found, downloading...", model_type) # The URL in the config is a 404 and needs to be fixed download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") _download(download_url, ckpt_path, config.CACHE_DIR) @@ -150,7 +150,7 @@ def load_model(ckpt_path, device, config, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info("model loaded: %.1fM params, %.3f loss", n_params / 1e6, val_loss) model.eval() model.to(device) del checkpoint, state_dict diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index 0247ec53c1..a398aaf977 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -225,7 +225,7 @@ def __init__( cfg: WavLMConfig, ) -> None: super().__init__() - logger.info(f"WavLM Config: {cfg.__dict__}") + logger.info("WavLM Config: %s", cfg.__dict__) self.cfg = cfg feature_enc_layers = eval(cfg.conv_feature_layers) diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py index 2f504704ef..968873cfec 100644 --- a/TTS/vc/models/knnvc.py +++ b/TTS/vc/models/knnvc.py @@ -74,7 +74,7 @@ def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor x, sr = torchaudio.load(audio, normalize=True) if not sr == self.config.audio.sample_rate: - 
logger.info(f"Resampling {sr} to {self.config.audio.sample_rate} in {audio}") + logger.info("Resampling %d to %d in %s", sr, self.config.audio.sample_rate, audio) x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) sr = self.config.audio.sample_rate diff --git a/pyproject.toml b/pyproject.toml index 95a1cead69..bc5b869dce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -191,6 +191,7 @@ lint.extend-select = [ "F704", # yield-outside-function "F706", # return-outside-function "F841", # unused-variable + "G004", # no f-string in logging "I", # import sorting "PIE790", # unnecessary-pass "PLC", From f67f0a1718f77d3f11801357ace632d1fcc26d7e Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 00:44:25 +0100 Subject: [PATCH 4/8] refactor: apply safe automatic ruff lint fixes Not manually checked. Generated with: uv run ruff check tests/ TTS/ notebooks/ recipes/ --fix --- TTS/api.py | 77 +++++++++---------- TTS/bin/compute_embeddings.py | 5 +- TTS/bin/compute_statistics.py | 6 +- TTS/bin/extract_tts_spectrograms.py | 5 +- TTS/bin/find_unique_phonemes.py | 5 +- TTS/bin/synthesize.py | 5 +- TTS/bin/train_encoder.py | 17 ++-- TTS/bin/train_vocoder.py | 3 +- TTS/config/__init__.py | 4 +- TTS/config/shared_configs.py | 3 +- TTS/demos/xtts_ft_demo/xtts_demo.py | 2 +- TTS/encoder/configs/base_encoder_config.py | 9 +-- TTS/encoder/utils/generic_utils.py | 2 +- TTS/encoder/utils/prepare_voxceleb.py | 3 +- TTS/model.py | 4 +- TTS/server/server.py | 3 +- TTS/tts/configs/align_tts_config.py | 5 +- TTS/tts/configs/bark_config.py | 7 +- TTS/tts/configs/delightful_tts_config.py | 5 +- TTS/tts/configs/fast_pitch_config.py | 3 +- TTS/tts/configs/fast_speech_config.py | 3 +- TTS/tts/configs/fastspeech2_config.py | 3 +- TTS/tts/configs/glow_tts_config.py | 3 +- TTS/tts/configs/neuralhmm_tts_config.py | 5 +- TTS/tts/configs/overflow_config.py | 5 +- TTS/tts/configs/shared_configs.py | 7 +- TTS/tts/configs/speedy_speech_config.py | 3 
+- TTS/tts/configs/tacotron_config.py | 5 +- TTS/tts/configs/vits_config.py | 7 +- TTS/tts/configs/xtts_config.py | 3 +- TTS/tts/datasets/__init__.py | 13 ++-- TTS/tts/datasets/dataset.py | 30 ++++---- TTS/tts/datasets/formatters.py | 59 +++++++------- TTS/tts/layers/bark/inference_funcs.py | 13 ++-- .../layers/delightful_tts/acoustic_model.py | 12 +-- TTS/tts/layers/delightful_tts/conv_layers.py | 4 +- TTS/tts/layers/delightful_tts/encoders.py | 16 ++-- .../layers/delightful_tts/energy_adaptor.py | 4 +- TTS/tts/layers/delightful_tts/networks.py | 3 +- .../layers/delightful_tts/pitch_adaptor.py | 4 +- TTS/tts/layers/generic/aligner.py | 4 +- TTS/tts/layers/generic/pos_encoding.py | 4 +- TTS/tts/layers/losses.py | 4 +- TTS/tts/layers/overflow/common_layers.py | 7 +- TTS/tts/layers/overflow/neural_hmm.py | 4 +- TTS/tts/layers/tacotron/tacotron.py | 1 - TTS/tts/layers/tortoise/audio_utils.py | 9 +-- TTS/tts/layers/tortoise/autoregressive.py | 5 +- TTS/tts/layers/tortoise/dpm_solver.py | 26 +++---- TTS/tts/layers/tortoise/transformer.py | 4 +- TTS/tts/layers/tortoise/vocoder.py | 4 +- TTS/tts/layers/vits/transforms.py | 2 +- TTS/tts/layers/xtts/stream_generator.py | 40 +++++----- TTS/tts/layers/xtts/tokenizer.py | 2 +- TTS/tts/layers/xtts/trainer/gpt_trainer.py | 19 +++-- TTS/tts/layers/xtts/zh_num2words.py | 16 ++-- TTS/tts/models/__init__.py | 2 +- TTS/tts/models/align_tts.py | 3 +- TTS/tts/models/bark.py | 7 +- TTS/tts/models/base_tacotron.py | 13 ++-- TTS/tts/models/base_tts.py | 23 +++--- TTS/tts/models/delightful_tts.py | 41 +++++----- TTS/tts/models/forward_tts.py | 19 +++-- TTS/tts/models/glow_tts.py | 15 ++-- TTS/tts/models/neuralhmm_tts.py | 11 ++- TTS/tts/models/overflow.py | 11 ++- TTS/tts/models/tacotron.py | 10 +-- TTS/tts/models/tacotron2.py | 10 +-- TTS/tts/models/vits.py | 54 ++++++------- TTS/tts/models/xtts.py | 9 +-- TTS/tts/utils/data.py | 6 +- TTS/tts/utils/helpers.py | 6 +- TTS/tts/utils/languages.py | 16 ++-- TTS/tts/utils/managers.py | 52 
++++++------- TTS/tts/utils/speakers.py | 16 ++-- TTS/tts/utils/ssim.py | 19 +++-- TTS/tts/utils/synthesis.py | 12 +-- TTS/tts/utils/text/characters.py | 3 +- .../utils/text/chinese_mandarin/numbers.py | 1 - .../utils/text/chinese_mandarin/phonemizer.py | 8 +- TTS/tts/utils/text/cleaners.py | 3 +- TTS/tts/utils/text/cmudict.py | 2 - TTS/tts/utils/text/english/number_norm.py | 3 +- TTS/tts/utils/text/korean/ko_dictionary.py | 1 - TTS/tts/utils/text/korean/korean.py | 3 +- .../text/phonemizers/bangla_phonemizer.py | 4 +- TTS/tts/utils/text/phonemizers/base.py | 5 +- .../text/phonemizers/belarusian_phonemizer.py | 4 +- .../utils/text/phonemizers/espeak_wrapper.py | 3 +- .../utils/text/phonemizers/gruut_wrapper.py | 3 +- .../text/phonemizers/ja_jp_phonemizer.py | 4 +- .../text/phonemizers/ko_kr_phonemizer.py | 4 +- .../text/phonemizers/multi_phonemizer.py | 7 +- .../text/phonemizers/zh_cn_phonemizer.py | 4 +- TTS/tts/utils/text/tokenizer.py | 17 ++-- TTS/utils/audio/numpy_transforms.py | 20 ++--- TTS/utils/audio/processor.py | 8 +- TTS/utils/capacitron_optimizer.py | 2 +- TTS/utils/download.py | 21 ++--- TTS/utils/downloaders.py | 5 +- TTS/utils/generic_utils.py | 18 ++--- TTS/utils/manage.py | 30 ++++---- TTS/utils/radam.py | 8 +- TTS/utils/samplers.py | 4 +- TTS/utils/synthesizer.py | 34 ++++---- TTS/vc/configs/freevc_config.py | 13 ++-- TTS/vc/configs/openvoice_config.py | 7 +- TTS/vc/configs/shared_configs.py | 5 +- TTS/vc/layers/freevc/speaker_encoder/audio.py | 3 +- .../freevc/speaker_encoder/speaker_encoder.py | 5 +- TTS/vc/layers/freevc/wavlm/modules.py | 37 +++++---- TTS/vc/layers/freevc/wavlm/wavlm.py | 12 +-- TTS/vc/models/base_vc.py | 24 +++--- TTS/vc/models/freevc.py | 15 ++-- TTS/vc/models/knnvc.py | 15 ++-- TTS/vc/models/openvoice.py | 21 +++-- TTS/vocoder/configs/univnet_config.py | 9 +-- TTS/vocoder/datasets/__init__.py | 2 +- TTS/vocoder/datasets/wavegrad_dataset.py | 3 +- TTS/vocoder/layers/losses.py | 6 +- TTS/vocoder/models/gan.py | 27 ++++--- 
TTS/vocoder/models/univnet_generator.py | 3 +- TTS/vocoder/models/wavegrad.py | 25 +++--- TTS/vocoder/models/wavernn.py | 21 +++-- TTS/vocoder/utils/distribution.py | 2 +- TTS/vocoder/utils/generic_utils.py | 3 +- notebooks/dataset_analysis/analyze.py | 6 +- pyproject.toml | 1 + tests/__init__.py | 5 +- tests/data_tests/test_loader.py | 2 +- tests/tts_tests/test_losses.py | 34 ++++---- .../test_tacotron2_d-vectors_train.py | 2 +- tests/tts_tests/test_tacotron2_model.py | 36 ++++----- tests/tts_tests/test_tacotron_layers.py | 4 +- tests/tts_tests/test_tacotron_model.py | 36 ++++----- tests/tts_tests/test_vits.py | 6 +- tests/tts_tests2/test_glow_tts.py | 6 +- tests/vocoder_tests/test_wavegrad.py | 4 +- 138 files changed, 690 insertions(+), 820 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 6db929411c..126a7a68f8 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,6 @@ import tempfile import warnings from pathlib import Path -from typing import Optional, Union from torch import nn @@ -22,15 +21,15 @@ def __init__( self, model_name: str = "", *, - model_path: Optional[str] = None, - config_path: Optional[str] = None, - vocoder_name: Optional[str] = None, - vocoder_path: Optional[str] = None, - vocoder_config_path: Optional[str] = None, - encoder_path: Optional[str] = None, - encoder_config_path: Optional[str] = None, - speakers_file_path: Optional[str] = None, - language_ids_file_path: Optional[str] = None, + model_path: str | None = None, + config_path: str | None = None, + vocoder_name: str | None = None, + vocoder_path: str | None = None, + vocoder_config_path: str | None = None, + encoder_path: str | None = None, + encoder_config_path: str | None = None, + speakers_file_path: str | None = None, + language_ids_file_path: str | None = None, progress_bar: bool = True, gpu: bool = False, ) -> None: @@ -77,8 +76,8 @@ def __init__( super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = 
load_config(config_path) if config_path else None - self.synthesizer: Optional[Synthesizer] = None - self.voice_converter: Optional[Synthesizer] = None + self.synthesizer: Synthesizer | None = None + self.voice_converter: Synthesizer | None = None self.model_name = "" self.vocoder_path = vocoder_path @@ -156,8 +155,8 @@ def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() def download_model_by_name( - self, model_name: str, vocoder_name: Optional[str] = None - ) -> tuple[Optional[Path], Optional[Path], Optional[Path], Optional[Path], Optional[Path]]: + self, model_name: str, vocoder_name: str | None = None + ) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files @@ -176,7 +175,7 @@ def download_model_by_name( vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) return model_path, config_path, vocoder_path, vocoder_config_path, None - def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: + def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: @@ -185,7 +184,7 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None """ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: + def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. 
Args: @@ -205,7 +204,7 @@ def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = N use_cuda=gpu, ) - def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: + def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. Args: @@ -261,11 +260,11 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool def _check_arguments( self, - speaker: Optional[str] = None, - language: Optional[str] = None, - speaker_wav: Optional[str] = None, - emotion: Optional[str] = None, - speed: Optional[float] = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -284,11 +283,11 @@ def _check_arguments( def tts( self, text: str, - speaker: Optional[str] = None, - language: Optional[str] = None, - speaker_wav: Optional[str] = None, - emotion: Optional[str] = None, - speed: Optional[float] = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, split_sentences: bool = True, **kwargs, ): @@ -333,10 +332,10 @@ def tts( def tts_to_file( self, text: str, - speaker: Optional[str] = None, - language: Optional[str] = None, - speaker_wav: Optional[str] = None, - emotion: Optional[str] = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, speed: float = 1.0, pipe_out=None, file_path: str = "output.wav", @@ -388,7 +387,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: Union[str, list[str]], + target_wav: str | list[str], ): """Voice conversion with FreeVC. Convert source wav to target speaker. 
@@ -406,7 +405,7 @@ def voice_conversion( def voice_conversion_to_file( self, source_wav: str, - target_wav: Union[str, list[str]], + target_wav: str | list[str], file_path: str = "output.wav", pipe_out=None, ) -> str: @@ -430,9 +429,9 @@ def tts_with_vc( self, text: str, *, - language: Optional[str] = None, - speaker_wav: Union[str, list[str]], - speaker: Optional[str] = None, + language: str | None = None, + speaker_wav: str | list[str], + speaker: str | None = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. @@ -473,10 +472,10 @@ def tts_with_vc_to_file( self, text: str, *, - language: Optional[str] = None, - speaker_wav: Union[str, list[str]], + language: str | None = None, + speaker_wav: str | list[str], file_path: str = "output.wav", - speaker: Optional[str] = None, + speaker: str | None = None, split_sentences: bool = True, pipe_out=None, ) -> str: diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index f103350912..d450e26fba 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -3,7 +3,6 @@ import os import sys from argparse import RawTextHelpFormatter -from typing import Optional import torch from tqdm import tqdm @@ -16,7 +15,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" """ @@ -185,7 +184,7 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -def main(arg_list: Optional[list[str]] = None): +def main(arg_list: list[str] | None = None): setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) args = parse_args(arg_list) diff --git a/TTS/bin/compute_statistics.py 
b/TTS/bin/compute_statistics.py index b7c52ac6c5..1da7a092fb 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -1,12 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import argparse import glob import logging import os import sys -from typing import Optional import numpy as np from tqdm import tqdm @@ -18,7 +16,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def parse_args(arg_list: Optional[list[str]]) -> tuple[argparse.Namespace, list[str]]: +def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -31,7 +29,7 @@ def parse_args(arg_list: Optional[list[str]]) -> tuple[argparse.Namespace, list[ return parser.parse_known_args(arg_list) -def main(arg_list: Optional[list[str]] = None): +def main(arg_list: list[str] | None = None): """Run preprocessing process.""" setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) args, overrides = parse_args(arg_list) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 77072f9efa..be9387f015 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -5,7 +5,6 @@ import logging import sys from pathlib import Path -from typing import Optional import numpy as np import torch @@ -27,7 +26,7 @@ use_cuda = torch.cuda.is_available() -def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) parser.add_argument("--checkpoint_path", 
type=str, help="Model file to be restored.", required=True) @@ -244,7 +243,7 @@ def extract_spectrograms( f.write(f"{data[0] / data[1]}.npy\n") -def main(arg_list: Optional[list[str]] = None) -> None: +def main(arg_list: list[str] | None = None) -> None: setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) args = parse_args(arg_list) config = load_config(args.config_path) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 0c453db85b..40afa1456c 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -5,7 +5,6 @@ import multiprocessing import sys from argparse import RawTextHelpFormatter -from typing import Optional from tqdm.contrib.concurrent import process_map @@ -21,7 +20,7 @@ def compute_phonemes(item: dict) -> set[str]: return set(ph) -def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -35,7 +34,7 @@ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: return parser.parse_args(arg_list) -def main(arg_list: Optional[list[str]] = None) -> None: +def main(arg_list: list[str] | None = None) -> None: setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) global phonemizer args = parse_args(arg_list) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index f963485c5d..00d7530427 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,7 +7,6 @@ import logging import sys from argparse import RawTextHelpFormatter -from typing import Optional # pylint: disable=redefined-outer-name, unused-argument from TTS.utils.generic_utils import ConsoleFormatter, setup_logger @@ -135,7 +134,7 @@ """ -def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> 
argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -311,7 +310,7 @@ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: return args -def main(arg_list: Optional[list[str]] = None) -> None: +def main(arg_list: list[str] | None = None) -> None: """Entry point for `tts` command line interface.""" args = parse_args(arg_list) stream = sys.stderr if args.pipe_out else sys.stdout diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index a37ab8efc9..8d377db241 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import os @@ -219,10 +218,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.print_step == 0: print( - " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + f" | > Step:{global_step} Loss:{loss.item():.5f} GradNorm:{grad_norm:.5f} " + f"StepTime:{step_time:.2f} LoaderTime:{loader_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} LR:{current_lr:.6f}", flush=True, ) @@ -236,10 +233,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("") print( - ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " - "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time - ), + f">>> Epoch:{epoch} AvgLoss: {tot_loss / len(data_loader):.5f} GradNorm:{grad_norm:.5f} " + f"EpochTime:{epoch_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} ", flush=True, ) # evaluation @@ -249,7 +244,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + 
f" | > Epoch:{epoch} AvgLoss: {eval_loss:.5f} ", flush=True, ) # save the best checkpoint @@ -311,7 +306,7 @@ def main(args): # pylint: disable=redefined-outer-name scheduler = None num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) if use_cuda: model = model.cuda() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 7cf5696237..58122b9005 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -2,7 +2,6 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional from trainer import Trainer, TrainerArgs @@ -18,7 +17,7 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(arg_list: Optional[list[str]] = None): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index e5f40c0296..e47a141c73 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -54,7 +54,7 @@ def register_config(model_name: str) -> Coqpit: return config_class -def _process_model_name(config_dict: Dict) -> str: +def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit: +def load_config(config_path: str | os.PathLike[Any]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. 
diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 7fae77d613..a0a013b0de 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass -from typing import List from coqpit import Coqpit, check_argument from trainer import TrainerConfig @@ -227,7 +226,7 @@ class BaseDatasetConfig(Coqpit): dataset_name: str = "" path: str = "" meta_file_train: str = "" - ignored_speakers: List[str] = None + ignored_speakers: list[str] = None language: str = "" phonemizer: str = "" meta_file_val: str = "" diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 7ac38ed6ee..dac5f0870a 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -104,7 +104,7 @@ def isatty(self): def read_logs(): sys.stdout.flush() - with open(sys.stdout.log_file, "r") as f: + with open(sys.stdout.log_file) as f: return f.read() diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index ebbaa0457b..97cbf47893 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import MISSING @@ -12,9 +11,9 @@ class BaseEncoderConfig(BaseTrainingConfig): model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params - model_params: Dict = field( + model_params: dict = field( default_factory=lambda: { "model_name": "lstm", "input_dim": 80, @@ -25,7 +24,7 @@ class BaseEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation: Dict = field(default_factory=lambda: {}) + audio_augmentation: dict = field(default_factory=lambda: {}) # 
training params epochs: int = 10000 @@ -33,7 +32,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 495b4def5a..d6c4f9fa50 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -class AugmentWAV(object): +class AugmentWAV: def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index 8ca7ea2fac..18ca21c876 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo # All rights reserved. 
# @@ -194,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info("Successfully generated csv file {}".format(csv_file_path)) + logger.info(f"Successfully generated csv file {csv_file_path}") def processor(directory, subset, force_process): diff --git a/TTS/model.py b/TTS/model.py index e024ad1a44..39faa7f690 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,6 +1,6 @@ import os from abc import abstractmethod -from typing import Any, Union +from typing import Any import torch from coqpit import Coqpit @@ -48,7 +48,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict def load_checkpoint( self, config: Coqpit, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, diff --git a/TTS/server/server.py b/TTS/server/server.py index cb4ed4d9b2..753e9103ab 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -9,7 +9,6 @@ import os import sys from threading import Lock -from typing import Union from urllib.parse import parse_qs try: @@ -108,7 +107,7 @@ def create_argparser() -> argparse.ArgumentParser: app = Flask(__name__) -def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: +def style_wav_uri_to_dict(style_wav: str) -> str | dict: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 317a01af53..2224396d1e 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.align_tts 
import AlignTTSArgs @@ -70,7 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) - phase_start_steps: List[int] = None + phase_start_steps: list[int] = None ssim_alpha: float = 1.0 spec_loss_alpha: float = 1.0 @@ -96,7 +95,7 @@ class AlignTTSConfig(BaseTTSConfig): r: int = 1 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index b846febe85..61d67b987a 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field -from typing import Dict from trainer.io import get_user_data_dir @@ -70,9 +69,9 @@ class BarkConfig(BaseTTSConfig): COARSE_INFER_TOKEN: int = 12_050 REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" - REMOTE_MODEL_PATHS: Dict = None - LOCAL_MODEL_PATHS: Dict = None - SMALL_REMOTE_MODEL_PATHS: Dict = None + REMOTE_MODEL_PATHS: dict = None + LOCAL_MODEL_PATHS: dict = None + SMALL_REMOTE_MODEL_PATHS: dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers")) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 805d995369..7f9e7a6ab2 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig @@ -73,7 +72,7 @@ class DelightfulTTSConfig(BaseTTSConfig): # optimizer 
steps_to_start_discriminator: int = 200000 - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -140,7 +139,7 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[List[str]] = field( + test_sentences: list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index d086d26564..5b50122e09 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -153,7 +152,7 @@ class FastPitchConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index af6c2db6fa..f375292256 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -147,7 +146,7 @@ class FastSpeechConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, 
and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index d179617fb0..3d6ce4f4b3 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -168,7 +167,7 @@ class Fastspeech2Config(BaseTTSConfig): energy_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index f42f3e5a51..34b4057093 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -171,7 +170,7 @@ class GlowTTSConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. 
# testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index 50f72847ed..be7a81fa89 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -126,7 +125,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.001 @@ -143,7 +142,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index dc3e5548b8..8a113f1f33 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -145,7 +144,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.01 @@ -174,7 +173,7 @@ class 
OverflowConfig(BaseTTSConfig): # The classname has to be camel case min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index bf17322c19..bd5a28b43c 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import Coqpit, check_argument @@ -138,7 +137,7 @@ class CharactersConfig(Coqpit): characters_class: str = None # using BaseVocabulary - vocab_dict: Dict = None + vocab_dict: dict = None # using on BaseCharacters pad: str = None @@ -323,7 +322,7 @@ class BaseTTSConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -331,7 +330,7 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index bf8517dfc4..29221d7b25 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -164,7 +163,7 @@ class SpeedySpeechConfig(BaseTTSConfig): f0_cache_path: str = None # 
testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 350b5ea996..7badbfac59 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @@ -154,7 +153,7 @@ class TacotronConfig(BaseTTSConfig): num_speakers: int = 1 num_chars: int = 0 r: int = 2 - gradual_training: List[List[int]] = None + gradual_training: list[list[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True @@ -212,7 +211,7 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 2d0242bf13..d85684c721 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @@ -112,7 +111,7 @@ class VitsConfig(BaseTTSConfig): audio: VitsAudioConfig = field(default_factory=VitsAudioConfig) # optimizer - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -146,7 +145,7 @@ class 
VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[List] = field( + test_sentences: list[list] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], @@ -167,7 +166,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index bbf048e1ab..da6cc6edc6 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig @@ -70,7 +69,7 @@ class XttsConfig(BaseTTSConfig): model_args: XttsArgs = field(default_factory=XttsArgs) audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None - languages: List[str] = field( + languages: list[str] = field( default_factory=lambda: [ "en", "es", diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d1a37da4c1..a99eb4290d 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -2,8 +2,9 @@ import os import sys from collections import Counter +from collections.abc import Callable from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np @@ -39,9 +40,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): assert ( eval_split_size > 0 - ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format( - 1 / len(items) - ) + ), f" [!] You do not have enough samples for the evaluation set. 
You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" np.random.seed(0) np.random.shuffle(items) if is_multi_speaker: @@ -71,12 +70,12 @@ def add_extra_keys(metadata, language, dataset_name): def load_tts_samples( - datasets: Union[List[Dict], Dict], + datasets: list[dict] | dict, eval_split=True, formatter: Callable = None, eval_split_max_size=None, eval_split_size=0.01, -) -> Tuple[List[List], List[List]]: +) -> tuple[list[list], list[list]]: """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. @@ -153,7 +152,7 @@ def load_tts_samples( def load_attention_mask_meta_data(metafile_path): """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r", encoding="utf-8") as f: + with open(metafile_path, encoding="utf-8") as f: lines = f.readlines() meta_data = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 2b0cdf2c9b..6f21dcd1e0 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -3,7 +3,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import numpy as np import numpy.typing as npt @@ -47,7 +47,7 @@ def string2filename(string: str) -> str: return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") -def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: +def get_audio_size(audiopath: str | os.PathLike[Any]) -> int: """Return the number of samples in the audio file.""" if not isinstance(audiopath, str): audiopath = str(audiopath) @@ -63,7 +63,7 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = 
None): +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict | None = None): """Create inverse frequency weights for balancing the dataset. Use `multi_dict` to scale relative weights.""" @@ -94,23 +94,23 @@ def __init__( outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: Optional[list[dict]] = None, + samples: list[dict] | None = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: Optional[str] = None, - energy_cache_path: Optional[str] = None, + f0_cache_path: str | None = None, + energy_cache_path: str | None = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: Optional[str] = None, + phoneme_cache_path: str | None = None, precompute_num_workers: int = 0, - speaker_id_mapping: Optional[dict] = None, - d_vector_mapping: Optional[dict] = None, - language_id_mapping: Optional[dict] = None, + speaker_id_mapping: dict | None = None, + d_vector_mapping: dict | None = None, + language_id_mapping: dict | None = None, use_noise_augment: bool = False, start_by_longest: bool = False, ) -> None: @@ -640,7 +640,7 @@ class PhonemeDataset(Dataset): def __init__( self, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], tokenizer: "TTSTokenizer", cache_path: str, precompute_num_workers: int = 0, @@ -744,10 +744,10 @@ class F0Dataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers: int = 0, normalize_f0: bool = True, ) -> None: @@ -896,9 +896,9 @@ class EnergyDataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | 
list[dict], ap: "AudioProcessor", - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers=0, normalize_energy=True, ) -> None: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ff1a76e2c9..6cf65c9b5e 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree as ET from glob import glob from pathlib import Path -from typing import List from tqdm import tqdm @@ -21,7 +20,7 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): https://github.com/freds0/CML-TTS-Dataset/""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -61,7 +60,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): """Interal dataset formatter.""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -104,7 +103,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "tweb" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") @@ -118,7 +117,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") 
as ttf: for line in ttf: cols = line.split("|") wav_file = cols[1].strip() @@ -133,7 +132,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="ISO 8859-1") as ttf: + with open(txt_file, encoding="ISO 8859-1") as ttf: for line in ttf: cols = line.strip().split("|") wav_file = cols[0].strip() @@ -177,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if speaker_name in ignored_speakers: continue logger.info(csv_file) - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") if not meta_files: @@ -201,7 +200,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ljspeech" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -215,7 +214,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: speaker_id = 0 for idx, line in enumerate(ttf): # 2 samples per speaker to avoid eval split issues @@ -236,7 +235,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "thorsten" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -268,7 +267,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: 
disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ruslan" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") @@ -282,7 +281,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "css10" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -296,7 +295,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "nancy" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] @@ -309,7 +308,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("client_id"): continue @@ -338,7 +337,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): for meta_file in meta_files: _meta_file = os.path.basename(meta_file).split(".")[0] - with open(meta_file, "r", encoding="utf-8") as ttf: + with open(meta_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") file_name = cols[0] @@ -368,7 +367,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar items = [] speaker_name = "turkish-female" skipped_files = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as 
ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav") @@ -386,7 +385,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("wav_filename"): continue @@ -433,7 +432,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] # p280 has no mic2 recordings if speaker_id == "p280": @@ -460,7 +459,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append( @@ -482,7 +481,7 @@ def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt") ) if os.path.exists(txt_file) and os.path.exists(wav_file): - with open(txt_file, "r", encoding="utf-8") as file_text: + with open(txt_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -500,7 +499,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + 
with open(meta_file, encoding="utf-8") as file_text: text = file_text.readline().replace("\n", "") # ignore sentences that contains digits if ignore_digits_sentences and any(map(str.isdigit, text)): @@ -513,7 +512,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] - with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: + with open(os.path.join(root_path, meta_files), encoding="utf-8") as meta: for line in meta: file, text = line.split("\t") text = text[:-1] @@ -553,7 +552,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): # if not exists meta file, crawl recursively for 'wav' files if meta_file is not None: - with open(str(meta_file), "r", encoding="utf-8") as f: + with open(str(meta_file), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] elif not cache_to.exists(): @@ -575,7 +574,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. 
Should be around {expected_count}, is: {cnt}") - with open(str(cache_to), "r", encoding="utf-8") as f: + with open(str(cache_to), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] @@ -583,7 +582,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): """Generic emotion dataset""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("file_path"): continue @@ -601,7 +600,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): return items -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument +def baker(root_path: str, meta_file: str, **kwargs) -> list[list[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -613,7 +612,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) @@ -626,7 +625,7 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kokoro" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -640,7 +639,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kss" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") 
wav_file = os.path.join(root_path, cols[0]) @@ -653,7 +652,7 @@ def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "bel_tts" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 58331bc096..1d141dc537 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,7 +2,6 @@ import os import re from glob import glob -from typing import Dict, List, Optional, Tuple import librosa import numpy as np @@ -34,9 +33,9 @@ def _normalize_whitespace(text): return re.sub(r"\s+", " ", text).strip() -def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def get_voices(extra_voice_dirs: list[str] = []): # pylint: disable=dangerous-default-value dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -49,7 +48,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: +def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -58,9 +57,9 @@ def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( - model, voice: str, extra_voice_dirs: List[str] = [] -) -> Tuple[ - Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] + model, voice: str, extra_voice_dirs: list[str] = [] +) -> tuple[ + 
npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None ]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 2aa82c9a88..9110ff5fd0 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,6 +1,6 @@ ### credit: https://github.com/dunky11/voicesmith import logging -from typing import Callable, Dict, Tuple +from collections.abc import Callable import torch import torch.nn.functional as F @@ -177,7 +177,7 @@ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument self._init_d_vector() @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -194,11 +194,11 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -237,7 +237,7 @@ def _forward_aligner( x_mask: torch.IntTensor, y_mask: torch.IntTensor, attn_priors: torch.FloatTensor, - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. 
@@ -298,7 +298,7 @@ def forward( use_ground_truth: bool = True, d_vectors: torch.Tensor = None, speaker_idx: torch.Tensor = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index 1d5139571e..588d236852 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -1,11 +1,9 @@ -from typing import Tuple - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: +def calc_same_padding(kernel_size: int) -> tuple[int, int]: pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index bd0c319dc1..31bab8cc97 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple, Union - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F @@ -36,9 +34,9 @@ class ReferenceEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, ): super().__init__() @@ -80,7 +78,7 @@ def __init__( batch_first=True, ) - def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, 
torch.Tensor]: """ inputs --- [N, n_mels, timesteps] outputs --- [N, E//2] @@ -120,9 +118,9 @@ class UtteranceLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, @@ -192,9 +190,9 @@ class PhonemeLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, diff --git a/TTS/tts/layers/delightful_tts/energy_adaptor.py b/TTS/tts/layers/delightful_tts/energy_adaptor.py index ea0d1e4721..d2b4b0ffa8 100644 --- a/TTS/tts/layers/delightful_tts/energy_adaptor.py +++ b/TTS/tts/layers/delightful_tts/energy_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -59,7 +59,7 @@ def __init__( def get_energy_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index 4305022f18..d0a4adae79 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -1,5 +1,4 @@ import math -from typing import Tuple import numpy as np import torch @@ -9,7 +8,7 @@ from TTS.tts.layers.delightful_tts.conv_layers import 
ConvNorm -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: +def initialize_embeddings(shape: tuple[int]) -> torch.Tensor: assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." # Kaiming initialization return torch.randn(shape) * np.sqrt(2 / shape[1]) diff --git a/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/TTS/tts/layers/delightful_tts/pitch_adaptor.py index 9031369e0f..14e751d2e2 100644 --- a/TTS/tts/layers/delightful_tts/pitch_adaptor.py +++ b/TTS/tts/layers/delightful_tts/pitch_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -58,7 +58,7 @@ def __init__( def get_pitch_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/generic/aligner.py b/TTS/tts/layers/generic/aligner.py index baa6f0e9c4..480c48f9a4 100644 --- a/TTS/tts/layers/generic/aligner.py +++ b/TTS/tts/layers/generic/aligner.py @@ -1,5 +1,3 @@ -from typing import Tuple - import torch from torch import nn @@ -68,7 +66,7 @@ def init_layers(self): def forward( self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None - ) -> Tuple[torch.tensor, torch.tensor]: + ) -> tuple[torch.tensor, torch.tensor]: """Forward pass of the aligner encoder. 
Shapes: - queries: :math:`[B, C, T_de]` diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 913add0d14..695e37a6e0 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,9 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError( - "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) - ) + raise ValueError("Cannot use sin/cos positional encoding with " f"odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index db62430c9d..1e744d62cf 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -814,7 +814,7 @@ def __init__(self, c): elif c.spec_loss_type == "l1": self.spec_loss = L1LossMasked(False) else: - raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type)) + raise ValueError(f" [!] Unknown spec_loss_type {c.spec_loss_type}") if c.duration_loss_type == "mse": self.dur_loss = MSELossMasked(False) @@ -823,7 +823,7 @@ def __init__(self, c): elif c.duration_loss_type == "huber": self.dur_loss = Huber() else: - raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type)) + raise ValueError(f" [!] 
Unknown duration_loss_type {c.duration_loss_type}") if c.model_args.use_aligner: self.aligner_loss = ForwardSumLoss() diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index 9f77af293c..a477b34f0b 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,5 +1,4 @@ import logging -from typing import List, Tuple import torch import torch.nn.functional as F @@ -44,7 +43,7 @@ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutio ) self.rnn_state = None - def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]: + def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> tuple[torch.FloatTensor, torch.LongTensor]: """Forward pass to the encoder. Args: @@ -110,7 +109,7 @@ class ParameterModel(nn.Module): def __init__( self, - outputnet_size: List[int], + outputnet_size: list[int], input_size: int, output_size: int, frame_channels: int, @@ -152,7 +151,7 @@ def __init__( encoder_dim: int, memory_rnn_dim: int, frame_channels: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float = 1e-2, ): diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index a12becef03..9142f65e8c 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -1,5 +1,3 @@ -from typing import List - import torch import torch.distributions as tdist import torch.nn.functional as F @@ -57,7 +55,7 @@ def __init__( prenet_dropout: float, prenet_dropout_at_inference: bool, memory_rnn_dim: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float, use_grad_checkpointing: bool = True, diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 32643dfcee..6f33edf3d7 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ 
b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,3 @@ -# coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch import logging diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index c67ee6c44b..6d6bb8cdb7 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,7 +1,6 @@ import logging import os from glob import glob -from typing import Dict, List import librosa import numpy as np @@ -88,9 +87,9 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def get_voices(extra_voice_dirs: List[str] = []): +def get_voices(extra_voice_dirs: list[str] = []): dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -100,7 +99,7 @@ def get_voices(extra_voice_dirs: List[str] = []): return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): +def load_voice(voice: str, extra_voice_dirs: list[str] = []): if voice == "random": return None, None @@ -116,7 +115,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return conds, None -def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): +def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): latents = [] clips = [] for voice in voices: diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index 00c884e973..cbfe076825 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,7 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. 
import functools import random -from typing import Optional import torch import torch.nn as nn @@ -640,8 +639,8 @@ def inference_speech( def _prepare_attention_mask_for_generation( inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + pad_token_id: torch.Tensor | None, + eos_token_id: torch.Tensor | None, ) -> torch.LongTensor: # No information for attention mask inference -> return default attention mask default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index 6a1d8ff784..d34b61f486 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -98,9 +98,7 @@ def __init__( if schedule not in ["discrete", "linear", "cosine"]: raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule - ) + f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule @@ -150,7 +148,7 @@ def marginal_log_mean_coeff(self, t): t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device), - ).reshape((-1)) + ).reshape(-1) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 elif self.schedule == "cosine": @@ -447,7 +445,7 @@ def correcting_xt_fn(xt, t, step): Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. 
""" - self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) + self.model = lambda x, t: model_fn(x, t.expand(x.shape[0])) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type @@ -527,7 +525,7 @@ def get_time_steps(self, skip_type, t_T, t_0, N, device): return t else: raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type) + f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): @@ -693,7 +691,7 @@ def singlestep_dpm_solver_second_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 0.5 ns = self.noise_schedule @@ -790,7 +788,7 @@ def singlestep_dpm_solver_third_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: @@ -913,7 +911,7 @@ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, x_t: A pytorch tensor. The approximated solution at time `t`. 
""" if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") ns = self.noise_schedule model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1] t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1] @@ -1062,7 +1060,7 @@ def singlestep_dpm_solver_update( r2=r2, ) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"): """ @@ -1086,7 +1084,7 @@ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, elif order == 3: return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def dpm_solver_adaptive( self, @@ -1150,8 +1148,8 @@ def higher_update(x, s, t, **kwargs): return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: + raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}") + while torch.abs(s - t_0).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) @@ -1487,7 +1485,7 @@ def sample( if return_intermediate: intermediates.append(x) else: - raise ValueError("Got wrong method {}".format(method)) + raise ValueError(f"Got wrong method {method}") if denoise_to_zero: t = torch.ones((1,)).to(device) * t_0 x = 
self.denoise_to_zero_fn(x, t) diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index ed4d79d4ab..c1854bd196 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union +from typing import TypeVar import torch import torch.nn.functional as F @@ -11,7 +11,7 @@ _T = TypeVar("_T") -def cast_tuple(val: Union[tuple[_T], list[_T], _T], depth: int = 1) -> tuple[_T]: +def cast_tuple(val: tuple[_T] | list[_T] | _T, depth: int = 1) -> tuple[_T]: if isinstance(val, list): return tuple(val) return val if isinstance(val, tuple) else (val,) * depth diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index a5200c2673..6fd784f486 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,6 +1,6 @@ +from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional import torch import torch.nn as nn @@ -378,7 +378,7 @@ def inference(self, c, z=None): class VocType: constructor: Callable[[], nn.Module] model_path: str - subkey: Optional[str] = None + subkey: str | None = None def optionally_index(self, model_dict): if self.subkey is not None: diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py index 3cac1b8d6d..da5deea9ef 100644 --- a/TTS/tts/layers/vits/transforms.py +++ b/TTS/tts/layers/vits/transforms.py @@ -74,7 +74,7 @@ def unconstrained_rational_quadratic_spline( outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError("{} tails are not implemented.".format(tails)) + raise RuntimeError(f"{tails} tails are not implemented.") outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( inputs=inputs[inside_interval_mask], diff --git a/TTS/tts/layers/xtts/stream_generator.py 
b/TTS/tts/layers/xtts/stream_generator.py index 2f4b54cec1..303a990c27 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings -from typing import Callable, Optional, Union +from collections.abc import Callable import numpy as np import torch @@ -48,15 +48,15 @@ class NewGenerationMixin(GenerationMixin): @torch.inference_mode() def generate( # noqa: PLR0911 self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[StreamGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, - synced_gpus: Optional[bool] = False, + inputs: torch.Tensor | None = None, + generation_config: StreamGenerationConfig | None = None, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]] | None = None, + synced_gpus: bool | None = False, seed: int = 0, **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: + ) -> GenerateOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head. 
@@ -666,19 +666,19 @@ def typeerror(): def sample_stream( self, input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, list[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_warper: LogitsProcessorList | None = None, + max_length: int | None = None, + pad_token_id: int | None = None, + eos_token_id: int | list[int] | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + output_scores: bool | None = None, + return_dict_in_generate: bool | None = None, + synced_gpus: bool | None = False, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> SampleOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. 
diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index fec8358deb..8af4e78a40 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -505,7 +505,7 @@ def _expand_decimal_point(m, lang="en"): def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) + amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))) full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 9e8e753a61..271e7b46af 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -31,7 +30,7 @@ class GPTTrainerConfig(XttsConfig): optimizer_wd_only_on_weights: bool = False weighted_loss_attrs: dict = field(default_factory=lambda: {}) weighted_loss_multipliers: dict = field(default_factory=lambda: {}) - test_sentences: List[dict] = field(default_factory=lambda: []) + test_sentences: list[dict] = field(default_factory=lambda: []) @dataclass @@ -222,7 +221,7 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels return losses @torch.inference_mode() - def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 + def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: # init gpt for inference mode @@ -237,7 +236,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 s_info["language"], gpt_cond_len=3, )["wav"] - test_audios["{}-audio".format(idx)] = wav + test_audios[f"{idx}-audio"] = wav # delete inference layers del self.xtts.gpt.gpt_inference @@ -249,7 +248,7 @@ def test_log( ) -> None: 
logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: return batch @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction @@ -351,9 +350,9 @@ def get_sampler(self, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -396,7 +395,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the optimizer based on the config parameters.""" # ToDo: deal with multi GPU training if self.config.optimizer_wd_only_on_weights: @@ -460,7 +459,7 @@ def get_optimizer(self) -> List: parameters=self.xtts.gpt.parameters(), ) - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the scheduler for the optimizer. 
Args: @@ -491,7 +490,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GPTTrainerConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index 69b8dae952..f0a1db786d 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -392,7 +392,7 @@ # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(object): +class ChineseChar: """ 中文字符 每个字符对应简体和繁体, @@ -426,7 +426,7 @@ def __init__(self, power, simplified, traditional, big_s, big_t): self.big_t = big_t def __str__(self): - return "10^{}".format(self.power) + return f"10^{self.power}" @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): @@ -447,7 +447,7 @@ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=Fals power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] ) else: - raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) + raise ValueError(f"Counting type should be in {NUMBERING_TYPES} ({numbering_type} provided).") class ChineseNumberDigit(ChineseChar): @@ -487,13 +487,13 @@ def __init__(self, simplified, traditional, symbol, expression=None): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(object): +class NumberSystem: """ 中文数字系统 """ -class MathSymbol(object): +class MathSymbol: """ 用于中文数字系统的数学符号 (繁/简体), e.g.
positive = ['正', '正'] @@ -640,7 +640,7 @@ def compute_value(integer_symbols): int_str = str(compute_value(int_part)) dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return "{0}.{1}".format(int_str, dec_str) + return f"{int_str}.{dec_str}" else: return int_str @@ -686,7 +686,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError(f"invalid input num string with more than one dot: {number_string}") if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -1166,7 +1166,7 @@ def __call__(self, text): ) ndone = 0 - with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + with open(args.ifile, encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: if args.format == "tsv": reader = csv.DictReader(istream, delimiter="\t") assert "TEXT" in reader.fieldnames diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index ebfa171c80..0e1587b60b 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -6,7 +6,7 @@ logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": +def setup_model(config: "Coqpit", samples: list[list] | list[dict] = None) -> "BaseTTS": logger.info("Using model: %s", config.model) # fetch the right model implementation.
if "base_model" in config and config["base_model"] is not None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index c1d0cf0aea..12c3d18252 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -403,7 +402,7 @@ def on_epoch_start(self, trainer): self.phase = self._set_phase(trainer.config, trainer.total_steps_done) @staticmethod - def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "AlignTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index 6a480e6f5c..ca1c301281 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,7 +1,6 @@ import os from dataclasses import dataclass from pathlib import Path -from typing import Optional import numpy as np from coqpit import Coqpit @@ -65,7 +64,7 @@ def train_step( def text_to_semantic( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, allow_early_stop=True, @@ -95,7 +94,7 @@ def text_to_semantic( def semantic_to_waveform( self, semantic_tokens: np.ndarray, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, ): @@ -129,7 +128,7 @@ def semantic_to_waveform( def generate_audio( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, text_temp: float = 0.7, waveform_temp: float = 0.7, base=None, diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 79cdf1a7d4..8821036b5f 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,7 +1,6 @@ import copy import logging from abc import abstractmethod -from typing import Dict, Tuple import torch 
from coqpit import Coqpit @@ -62,7 +61,7 @@ def __init__( self.coarse_decoder = None @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: + def _format_aux_input(aux_input: dict) -> dict: """Set missing fields to their default values""" if aux_input: return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) @@ -141,7 +140,7 @@ def init_from_config(config: Coqpit): # TEST AND LOG FUNCTIONS # ########################## - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -169,13 +168,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 33a75598c9..0976e4cdab 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,7 +1,6 @@ import logging import os import random -from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -79,7 +78,7 @@ def _set_model_args(self, config: Coqpit): else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: List = None): + def init_multispeaker(self, config: Coqpit, data: list = None): """Set up for 
multi-speaker TTS. Initialize a speaker embedding layer if needed and define expected embedding @@ -114,7 +113,7 @@ def init_multispeaker(self, config: Coqpit, data: List = None): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs) -> dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} @@ -165,7 +164,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Generic batch formatting for `TTSDataset`. You must override this if you use a custom dataset. @@ -285,9 +284,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -366,7 +365,7 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict: d_vector = None if self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] @@ -383,7 +382,7 @@ def _get_test_aux_input( } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -414,13 +413,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index bee008e26f..5db0f3df60 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -3,7 +3,6 @@ from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -65,7 +64,7 @@ class ForwardTTSE2eF0Dataset(F0Dataset): def __init__( self, ap, - samples: Union[List[List], List[Dict]], + samples: list[list] | list[dict], cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -275,15 +274,15 @@ def collate_fn(self, batch): @dataclass class VocoderConfig(Coqpit): resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] 
= field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) use_spectral_norm_discriminator: bool = False - upsampling_rates_discriminator: List[int] = field(default_factory=lambda: [4, 4, 4, 4]) - periods_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) - pretrained_model_path: Optional[str] = None + upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4]) + periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + pretrained_model_path: str | None = None @dataclass @@ -553,7 +552,7 @@ def forward( attn_priors: torch.FloatTensor = None, d_vectors: torch.FloatTensor = None, speaker_idx: torch.LongTensor = None, - ) -> Dict: + ) -> dict: """Model's forward pass. Args: @@ -1015,7 +1014,7 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): return return_dict @torch.inference_mode() - def test_run(self, assets) -> Tuple[Dict, Dict]: + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -1041,9 +1040,9 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: d_vector=aux_inputs["d_vector"], ) # speaker_name = self.speaker_manager.speaker_names[aux_inputs["speaker_id"]] - test_audios["{}-audio".format(idx)] = outputs["wav"].T - test_audios["{}-audio_encoder".format(idx)] = outputs_gl["wav"].T - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_audios[f"{idx}-audio"] = outputs["wav"].T + test_audios[f"{idx}-audio_encoder"] = outputs_gl["wav"].T + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( @@ -1052,7 +1051,7 @@ def test_log( logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None d_vectors = None @@ -1160,9 +1159,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1217,7 +1216,7 @@ def get_data_loader( def get_criterion(self): return [VitsDiscriminatorLoss(self.config), DelightfulTTSLoss(self.config)] - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. Returns: @@ -1232,7 +1231,7 @@ def get_optimizer(self) -> List: ) return [optimizer_disc, optimizer_gen] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. 
Returns: @@ -1240,7 +1239,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -1260,7 +1259,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument @staticmethod def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None + config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None ): # pylint: disable=unused-argument """Initiate model from config diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 03166fa8c0..5b68475406 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -333,7 +332,7 @@ def format_durations(self, o_dr_log, x_mask): def _forward_encoder( self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Encoding forward pass. 1. Embed speaker IDs if multi-speaker mode. @@ -381,7 +380,7 @@ def _forward_decoder( x_mask: torch.FloatTensor, y_lengths: torch.IntTensor, g: torch.FloatTensor, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Decoding forward pass. 1. Compute the decoder output mask @@ -415,7 +414,7 @@ def _forward_pitch_predictor( x_mask: torch.IntTensor, pitch: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Pitch predictor forward pass. 1. Predict pitch from encoder outputs. 
@@ -451,7 +450,7 @@ def _forward_energy_predictor( x_mask: torch.IntTensor, energy: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Energy predictor forward pass. 1. Predict energy from encoder outputs. @@ -483,7 +482,7 @@ def _forward_energy_predictor( def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -522,7 +521,7 @@ def _forward_aligner( alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -544,8 +543,8 @@ def forward( dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, energy: torch.FloatTensor = None, - aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument - ) -> Dict: + aux_input: dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument + ) -> dict: """Model's forward pass. 
Args: @@ -805,7 +804,7 @@ def on_train_step_start(self, trainer): self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod - def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "ForwardTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index aaf5190ada..68b175afcc 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,6 +1,5 @@ import logging import math -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -162,7 +161,7 @@ def lock_act_norm_layers(self): if getattr(f, "set_ddi", False): f.set_ddi(False) - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): if aux_input is None: d_vectors = None speaker_ids = None @@ -179,7 +178,7 @@ def _set_speaker_input(self, aux_input: Dict): g = speaker_ids if speaker_ids is not None else d_vectors return g - def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: + def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = self._set_speaker_input(aux_input) # speaker embedding if g is not None: @@ -474,7 +473,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.inference_mode() - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -503,11 +502,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios def preprocess(self, y, y_lengths, y_max_length, attn=None): @@ -543,7 +542,7 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index b9a23000a0..a7c0ea7f14 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -102,7 +101,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -174,10 +173,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, 
criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. Args: @@ -239,7 +238,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "NeuralhmmTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -355,7 +354,7 @@ def train_log( logger.train_audios(steps, audios, self.ap.sample_rate) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 10157e43a4..85e1523307 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -116,7 +115,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -188,10 +187,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -255,7 +254,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "OverFlowConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -372,7 +371,7 @@ def train_log( logger.train_audios(steps, audios, self.ap.sample_rate) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index da85823f3f..879a2b94b5 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Tuple, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -280,7 +276,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: torch.nn.Module) -> tuple[dict, dict]: """Perform a single training step by fetching the right set of samples from the batch. 
Args: @@ -332,7 +328,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -396,7 +392,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "TacotronConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index e2edd4bb5c..c8c0c875ad 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -309,7 +305,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module): + def train_step(self, batch: dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. 
Args: @@ -360,7 +356,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -420,7 +416,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "Tacotron2Config", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 135b8e5016..28c0ff99c1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,7 +4,7 @@ from dataclasses import dataclass, field, replace from itertools import chain from pathlib import Path -from typing import Any, Dict, List, Tuple, Union +from typing import Any import numpy as np import torch @@ -401,12 +401,12 @@ class VitsArgs(Coqpit): dilation_rate_flow: int = 1 num_layers_flow: int = 4 resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = 
field(default_factory=lambda: [16, 16, 4, 4]) - periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + periods_multi_period_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) use_sdp: bool = True noise_scale: float = 1.0 inference_noise_scale: float = 0.667 @@ -419,7 +419,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: List[str] = None + d_vector_file: list[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 @@ -680,7 +680,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !") logger.info("Text Encoder was reinit.") - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} @@ -710,7 +710,7 @@ def _freeze_layers(self): param.requires_grad = False @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -732,7 +732,7 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -805,7 +805,7 @@ def forward( # pylint: disable=dangerous-default-value y_lengths: torch.tensor, waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - ) -> Dict: + ) -> dict: """Forward 
pass of the model. Args: @@ -1052,8 +1052,8 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) - g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy(np.array(speaker_cond_src)).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy(np.array(speaker_cond_tgt)).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) @@ -1066,7 +1066,7 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) return o_hat, y_mask, (z, z_p, z_hat) - def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Perform a single training step. Run the model forward pass and compute losses. Args: @@ -1264,7 +1264,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): } @torch.inference_mode() - def test_run(self, assets) -> Tuple[Dict, Dict]: + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -1290,8 +1290,8 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) + test_audios[f"{idx}-audio"] = wav + test_figures[f"{idx}-alignment"] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( @@ -1300,7 +1300,7 @@ def test_log( logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None language_ids = None @@ -1423,9 +1423,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1487,7 +1487,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returns 2 optimizers in a list. First one is for the discriminator @@ -1505,7 +1505,7 @@ def get_optimizer(self) -> List: ) return [optimizer0, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1513,7 +1513,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1589,7 +1589,7 @@ def load_fairseq_checkpoint( checkpoint_file = checkpoint_dir / "G_100000.pth" vocab_file = checkpoint_dir / "vocab.txt" # set config params - with open(config_file, "r", encoding="utf-8") as f: + with open(config_file, encoding="utf-8") as f: # Load the JSON data as a dictionary config_org = json.load(f) self.config.audio.sample_rate = config_org["data"]["sampling_rate"] @@ -1613,7 +1613,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -1825,7 +1825,7 @@ def to_config(self) -> "CharactersConfig": class FairseqVocab(BaseVocabulary): - def __init__(self, vocab: Union[str, os.PathLike[Any]]): + def __init__(self, vocab: str | os.PathLike[Any]): super(FairseqVocab).__init__() self.vocab = vocab @@ -1835,7 +1835,7 @@ def vocab(self): return self._vocab @vocab.setter - def vocab(self, vocab_file: Union[str, os.PathLike[Any]]): + def vocab(self, vocab_file: str | os.PathLike[Any]): with open(vocab_file, encoding="utf-8") as f: self._vocab = [x.replace("\n", "") for x in f.readlines()] self.blank = self._vocab[0] diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index c0a50800f6..63f161d05f 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -2,7 +2,6 @@ import os from dataclasses import dataclass from pathlib import Path -from typing import Optional import librosa import torch @@ -719,13 +718,13 @@ def get_compatible_checkpoint_state_dict(self, model_path): def load_checkpoint( self, config: "XttsConfig", - checkpoint_dir: Optional[str] = None, - checkpoint_path: Optional[str] = None, - vocab_path: Optional[str] = None, + checkpoint_dir: str | None = None, + checkpoint_path: str | None = None, + vocab_path: str | None = None, eval: bool = True, strict: bool = True, 
use_deepspeed: bool = False, - speaker_file_path: Optional[str] = None, + speaker_file_path: str | None = None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 22e46b683a..d0269060c8 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -11,7 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) + max_len = max(len(x) for x in inputs) return np.stack([_pad_data(x, max_len) for x in inputs]) @@ -23,7 +23,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + max_len = max(x.shape[1] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -46,7 +46,7 @@ def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: def prepare_stop_target(inputs, out_steps): """Pad row vectors with 1.""" - max_len = max((x.shape[0] for x in inputs)) + max_len = max(x.shape[0] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index ff10f751f2..cf02e5282b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np import torch from scipy.stats import betabinom @@ -35,7 +33,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length: torch.Tensor, max_len: Optional[int] = None) -> torch.Tensor: +def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. 
Args: @@ -164,7 +162,7 @@ def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: def generate_attention( - duration: torch.Tensor, x_mask: torch.Tensor, y_mask: Optional[torch.Tensor] = None + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: torch.Tensor | None = None ) -> torch.Tensor: """Generate an attention map from the linear scale durations. diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index c72de2d4e6..5ce7759dd8 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: Union[str, os.PathLike[Any]] = "", - config: Optional[Coqpit] = None, + language_ids_file_path: str | os.PathLike[Any] = "", + config: Coqpit | None = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -40,11 +40,11 @@ def num_languages(self) -> int: return len(list(self.name_to_id.keys())) @property - def language_names(self) -> List: + def language_names(self) -> list: return list(self.name_to_id.keys()) @staticmethod - def parse_language_ids_from_config(c: Coqpit) -> Dict: + def parse_language_ids_from_config(c: Coqpit) -> dict: """Set language id from config. 
Args: @@ -70,13 +70,13 @@ def set_language_ids_from_config(self, c: Coqpit) -> None: self.name_to_id = self.parse_language_ids_from_config(c) @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Any: + def parse_ids_from_data(items: list, parse_key: str) -> Any: raise NotImplementedError - def set_ids_from_data(self, items: List, parse_key: str) -> Any: + def set_ids_from_data(self, items: list, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index e009a7c438..49e93454f2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any import fsspec import numpy as np @@ -13,7 +13,7 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: Union[str, os.PathLike[Any]]): +def load_file(path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: @@ -25,7 +25,7 @@ def load_file(path: Union[str, os.PathLike[Any]]): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: Union[str, os.PathLike[Any]]): +def save_file(obj: Any, path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: @@ -42,23 +42,23 @@ class BaseIDManager: It defines common `ID` manager specific functions. 
""" - def __init__(self, id_file_path: Union[str, os.PathLike[Any]] = ""): + def __init__(self, id_file_path: str | os.PathLike[Any] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: Union[str, os.PathLike[Any]]) -> Dict: + def _load_json(json_file_path: str | os.PathLike[Any]) -> dict: with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: Union[str, os.PathLike[Any]], data: dict) -> None: + def _save_json(json_file_path: str | os.PathLike[Any], data: dict) -> None: with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) - def set_ids_from_data(self, items: List, parse_key: str) -> None: + def set_ids_from_data(self, items: list, parse_key: str) -> None: """Set IDs from data samples. Args: @@ -66,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_ids_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Set IDs from a file. Args: @@ -74,7 +74,7 @@ def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save IDs to a json file. Args: @@ -96,7 +96,7 @@ def get_random_id(self) -> Any: return None @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + def parse_ids_from_data(items: list, parse_key: str) -> tuple[dict]: """Parse IDs from data samples retured by `load_tts_samples()`. 
Args: @@ -133,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[Union[str, os.PathLike[Any]], list[Union[str, os.PathLike[Any]]]] = "", - id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + embedding_file_path: str | os.PathLike[Any] | list[str | os.PathLike[Any]] = "", + id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -179,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_embeddings_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save embeddings to a json file. Args: @@ -188,7 +188,7 @@ def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> No save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): + def read_embeddings_from_file(file_path: str | os.PathLike[Any]): """Load embeddings from a json file. Args: @@ -207,7 +207,7 @@ def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_embeddings_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Load embeddings from a json file. 
Args: @@ -217,7 +217,7 @@ def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> file_path ) - def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.PathLike[Any]]]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[str | os.PathLike[Any]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -242,7 +242,7 @@ def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.Path # reset name_to_id to get the right speaker ids self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} - def get_embedding_by_clip(self, clip_idx: str) -> List: + def get_embedding_by_clip(self, clip_idx: str) -> list: """Get embedding by clip ID. Args: @@ -253,7 +253,7 @@ def get_embedding_by_clip(self, clip_idx: str) -> List: """ return self.embeddings[clip_idx]["embedding"] - def get_embeddings_by_name(self, idx: str) -> List[List]: + def get_embeddings_by_name(self, idx: str) -> list[list]: """Get all embeddings of a speaker. Args: @@ -264,7 +264,7 @@ def get_embeddings_by_name(self, idx: str) -> List[List]: """ return self.embeddings_by_names[idx] - def get_embeddings_by_names(self) -> Dict: + def get_embeddings_by_names(self) -> dict: """Get all embeddings by names. Returns: @@ -313,11 +313,11 @@ def get_random_embedding(self) -> Any: return None - def get_clips(self) -> List: + def get_clips(self) -> list: return sorted(self.embeddings.keys()) def init_encoder( - self, model_path: Union[str, os.PathLike[Any]], config_path: Union[str, os.PathLike[Any]], use_cuda=False + self, model_path: str | os.PathLike[Any], config_path: str | os.PathLike[Any], use_cuda=False ) -> None: """Initialize a speaker encoder model. 
@@ -335,9 +335,7 @@ def init_encoder( self.encoder_ap = AudioProcessor(**self.encoder_config.audio) @torch.inference_mode() - def compute_embedding_from_clip( - self, wav_file: Union[Union[str, os.PathLike[Any]], List[Union[str, os.PathLike[Any]]]] - ) -> list: + def compute_embedding_from_clip(self, wav_file: str | os.PathLike[Any] | list[str | os.PathLike[Any]]) -> list: """Compute a embedding from a given audio file. Args: @@ -374,7 +372,7 @@ def _compute(wav_file: str): embedding = _compute(wav_file) return embedding[0].tolist() - def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + def compute_embeddings(self, feats: torch.Tensor | np.ndarray) -> list: """Compute embedding from features. Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 89c56583f5..026039ab29 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Optional, Union +from typing import Any import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: Optional[list[list[Any]]] = None, + data_items: list[list[Any]] | None = None, d_vectors_file_path: str = "", - speaker_id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + speaker_id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__( @@ -82,11 +82,11 @@ def num_speakers(self): def speaker_names(self): return list(self.name_to_id.keys()) - def get_speakers(self) -> List: + def get_speakers(self) -> list: return self.name_to_id @staticmethod - def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": + def init_from_config(config: 
"Coqpit", samples: list[list] | list[dict] = None) -> "SpeakerManager": """Initialize a speaker manager from config Args: @@ -150,7 +150,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: +def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: """Initiate a `SpeakerManager` instance by the provided config. Args: diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index eddf05db3f..24bab63ca1 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,6 +1,5 @@ # Adopted from https://github.com/photosynthesis-team/piq -from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -24,11 +23,11 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: def _validate_input( - tensors: List[torch.Tensor], - dim_range: Tuple[int, int] = (0, -1), - data_range: Tuple[float, float] = (0.0, -1.0), + tensors: list[torch.Tensor], + dim_range: tuple[int, int] = (0, -1), + data_range: tuple[float, float] = (0.0, -1.0), # size_dim_range: Tuple[float, float] = (0., -1.), - size_range: Optional[Tuple[int, int]] = None, + size_range: tuple[int, int] | None = None, ) -> None: r"""Check that input(-s) satisfies the requirements Args: @@ -89,13 +88,13 @@ def ssim( y: torch.Tensor, kernel_size: int = 11, kernel_sigma: float = 1.5, - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, reduction: str = "mean", full: bool = False, downsample: bool = True, k1: float = 0.01, k2: float = 0.03, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: r"""Interface of Structural Similarity (SSIM) index. Inputs supposed to be in range ``[0, data_range]``. To match performance with skimage and tensorflow set ``'downsample' = True``. 
@@ -218,7 +217,7 @@ def __init__( k2: float = 0.03, downsample: bool = True, reduction: str = "mean", - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, ) -> None: super().__init__() @@ -270,7 +269,7 @@ def _ssim_per_channel( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. Args: @@ -321,7 +320,7 @@ def _ssim_per_channel_complex( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. Args: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 5dc4cc569f..c09c3f5aa2 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,13 +1,9 @@ -from typing import Dict, Optional, Union - import numpy as np import torch from torch import nn -def numpy_to_torch( - np_array: np.ndarray, dtype: torch.dtype, device: Union[str, torch.device] = "cpu" -) -> Optional[torch.Tensor]: +def numpy_to_torch(np_array: np.ndarray, dtype: torch.dtype, device: str | torch.device = "cpu") -> torch.Tensor | None: if np_array is None: return None return torch.as_tensor(np_array, dtype=dtype, device=device) @@ -31,7 +27,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, -) -> Dict: +) -> dict: """Run a torch model for inference. It does not support batch inference. 
Args: @@ -75,14 +71,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def id_to_torch(aux_id, device: str | torch.device = "cpu") -> torch.Tensor | None: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def embedding_to_torch(d_vector, device: str | torch.device = "cpu") -> torch.Tensor | None: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 4bf9bf6bd5..da30692f5e 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,6 +1,5 @@ import logging from dataclasses import replace -from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig @@ -47,7 +46,7 @@ class BaseVocabulary: vocab (Dict): A dictionary of characters and their corresponding indices. """ - def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): + def __init__(self, vocab: dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): self.vocab = vocab self.pad = pad self.blank = blank diff --git a/TTS/tts/utils/text/chinese_mandarin/numbers.py b/TTS/tts/utils/text/chinese_mandarin/numbers.py index 4787ea6100..3e6a043918 100644 --- a/TTS/tts/utils/text/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/text/chinese_mandarin/numbers.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Licensed under WTFPL or the Unlicense or CC0. 
# This uses Python 3, but it's easy to port to Python 2 by changing diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index e9d62e9d06..4dccdd5778 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,5 +1,3 @@ -from typing import List - try: import jieba import pypinyin @@ -9,7 +7,7 @@ from .pinyinToPhonemes import PINYIN_DICT -def _chinese_character_to_pinyin(text: str) -> List[str]: +def _chinese_character_to_pinyin(text: str) -> list[str]: pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) pinyins_flat_list = [item for sublist in pinyins for item in sublist] return pinyins_flat_list @@ -25,9 +23,9 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) - pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + pinyined_text: list[str] = _chinese_character_to_pinyin(tokenized_text) - results: List[str] = [] + results: list[str] = [] for token in pinyined_text: if token[-1] in "12345": # TODO transform to is_pinyin() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f496b9f0dd..795ab246d2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,6 @@ """Set of default text cleaners""" import re -from typing import Optional from unicodedata import normalize from anyascii import anyascii @@ -47,7 +46,7 @@ def remove_aux_symbols(text: str) -> str: return text -def replace_symbols(text: str, lang: Optional[str] = "en") -> str: +def replace_symbols(text: str, lang: str | None = "en") -> str: """Replace symbols based on the language tag. 
Args: diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index f206fb043b..041b42ff31 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import re VALID_SYMBOLS = [ diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index c912e285e4..c5f2f452d5 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,7 +1,6 @@ """ from https://github.com/keithito/tacotron """ import re -from typing import Dict import inflect @@ -21,7 +20,7 @@ def _expand_decimal_point(m): return m.group(1).replace(".", " point ") -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: +def __expand_currency(value: str, inflection: dict[float, str]) -> str: parts = value.replace(",", "").split(".") if len(parts) > 2: return f"{value} {inflection[2]}" # Unexpected format diff --git a/TTS/tts/utils/text/korean/ko_dictionary.py b/TTS/tts/utils/text/korean/ko_dictionary.py index 9b739339c6..706f9f5daf 100644 --- a/TTS/tts/utils/text/korean/ko_dictionary.py +++ b/TTS/tts/utils/text/korean/ko_dictionary.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Add the word you want to the dictionary. 
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 423aeed377..0feef3bdfb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,5 +1,4 @@ -īģŋ# coding: utf-8 -# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py +īģŋ# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py index 3c4a35bbfa..3be7354636 100644 --- a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_bn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"bn": "Bangla"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 5e701df458..4bd03851c7 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc import logging -from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -37,7 +36,7 @@ class BasePhonemizer(abc.ABC): def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): # ensure the backend is installed on the system if not self.is_available(): - raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover + raise
RuntimeError(f"{self.name()} not installed on your system") # pragma: nocover # ensure the backend support the requested language self._language = self._init_language(language) @@ -93,7 +92,7 @@ def is_supported_language(self, language): def _phonemize(self, text, separator): """The main phonemization method""" - def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: + def _phonemize_preprocess(self, text) -> tuple[list[str], list]: """Preprocess the text before phonemization 1. remove spaces diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py index e5fcab6e09..fa4a515d1a 100644 --- a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -34,7 +32,7 @@ def _phonemize(self, text, separator): return self.phonemize_be(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"be": "Belarusian"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index a15df716e7..5c58afdf08 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -5,7 +5,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional from packaging.version import Version @@ -104,7 +103,7 @@ class ESpeak(BasePhonemizer): def __init__( self, language: str, - backend: Optional[str] = None, + backend: str | None = None, punctuations: str = Punctuation.default_puncs(), keep_puncs: bool = True, ): diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index f3e9c9abd4..836fccf5b8 100644 --- 
a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from typing import List import gruut from gruut_ipa import IPA @@ -114,7 +113,7 @@ def is_supported_language(self, language): return gruut.is_language_supported(language) @staticmethod - def supported_languages() -> List: + def supported_languages() -> list: """Get a dictionary of supported languages. Returns: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 878e5e5296..b3b3ba4db7 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -51,7 +49,7 @@ def phonemize(self, text: str, separator="|", language=None) -> str: return self._phonemize(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ja-jp": "Japanese (Japan)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index 0bdba2137b..93930d064e 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -44,7 +42,7 @@ def phonemize(self, text: str, separator: str = "", character: str = "hangeul", return self._phonemize(text, separator, character) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ko-kr": "hangeul(korean)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py 
b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 1a9e98b091..87fb940f6b 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name @@ -19,7 +18,7 @@ class MultiPhonemizer: lang_to_phonemizer = {} - def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + def __init__(self, lang_to_phonemizer_name: dict = {}) -> None: # pylint: disable=dangerous-default-value for k, v in lang_to_phonemizer_name.items(): if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] @@ -29,7 +28,7 @@ def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disab self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod - def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: + def init_phonemizers(lang_to_phonemizer_name: dict) -> dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) @@ -44,7 +43,7 @@ def phonemize(self, text, separator="|", language=""): raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) - def supported_languages(self) -> List: + def supported_languages(self) -> list: return list(self.lang_to_phonemizer.keys()) def print_logs(self, level: int = 0): diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index 41480c4173..9e70b03a0c 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.chinese_mandarin.phonemizer import 
chinese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_zh_cn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"zh-cn": "Chinese (China)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f653cdf13f..4d6c9e401e 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,5 +1,6 @@ import logging -from typing import Callable, Dict, List, Union +from collections.abc import Callable +from typing import Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes @@ -43,7 +44,7 @@ def __init__( use_phonemes=False, text_cleaner: Callable = None, characters: "BaseCharacters" = None, - phonemizer: Union["Phonemizer", Dict] = None, + phonemizer: Union["Phonemizer", dict] = None, add_blank: bool = False, use_eos_bos=False, ): @@ -65,7 +66,7 @@ def characters(self, new_characters): self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: """Encodes a string of text as a sequence of IDs.""" token_ids = [] for char in text: @@ -80,14 +81,14 @@ def encode(self, text: str) -> List[int]: logger.warning("Character %s not found in the vocabulary. 
Discarding it.", repr(char)) return token_ids - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: """Decodes a sequence of IDs to a string of text.""" text = "" for token_id in token_ids: text += self.characters.id_to_char(token_id) return text - def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument + def text_to_ids(self, text: str, language: str = None) -> list[int]: # pylint: disable=unused-argument """Converts a string of text to a sequence of token IDs. Args: @@ -121,15 +122,15 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: text = self.pad_with_bos_eos(text) return text - def ids_to_text(self, id_sequence: List[int]) -> str: + def ids_to_text(self, id_sequence: list[int]) -> str: """Converts a sequence of token IDs to a string of text.""" return self.decode(id_sequence) - def pad_with_bos_eos(self, char_sequence: List[str]): + def pad_with_bos_eos(self, char_sequence: list[str]): """Pads a sequence with the special BOS and EOS characters.""" return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id] - def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False): + def intersperse_blank_char(self, char_sequence: list[str], use_blank_char: bool = False): """Intersperses the blank character between characters in a sequence. Use the ```blank``` character if defined else use the ```pad``` character. 
diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 0cba7fc8a8..7fd4259178 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,7 +1,7 @@ import logging import os from io import BytesIO -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -21,7 +21,7 @@ def build_mel_basis( fft_size: int, num_mels: int, mel_fmin: int, - mel_fmax: Optional[int] = None, + mel_fmax: int | None = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. @@ -177,8 +177,8 @@ def stft( *, y: np.ndarray, fft_size: int, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -205,8 +205,8 @@ def stft( def istft( *, y: np.ndarray, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, window: str = "hann", center: bool = True, **kwargs, @@ -248,8 +248,8 @@ def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool def compute_f0( *, x: np.ndarray, - pitch_fmax: Optional[float] = None, - pitch_fmin: Optional[float] = None, + pitch_fmax: float | None = None, + pitch_fmin: float | None = None, hop_length: int, win_length: int, sample_rate: int, @@ -408,7 +408,7 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n def load_wav( - *, filename: Union[str, os.PathLike[Any]], sample_rate: Optional[int] = None, resample: bool = False, **kwargs + *, filename: str | os.PathLike[Any], sample_rate: int | None = None, resample: bool = False, **kwargs ) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. 
@@ -437,7 +437,7 @@ def load_wav( def save_wav( *, wav: np.ndarray, - path: Union[str, os.PathLike[Any]], + path: str | os.PathLike[Any], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index bf07333aea..9a8841106c 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -549,7 +549,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: str | os.PathLike[Any], sr: int | None = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -576,9 +576,7 @@ def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = N x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav( - self, wav: np.ndarray, path: Union[str, os.PathLike[Any]], sr: Optional[int] = None, pipe_out=None - ) -> None: + def save_wav(self, wav: np.ndarray, path: str | os.PathLike[Any], sr: int | None = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. 
Args: diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index 7206ffd508..01f303f98d 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -1,4 +1,4 @@ -from typing import Generator +from collections.abc import Generator from trainer.trainer_utils import get_optimizer diff --git a/TTS/utils/download.py b/TTS/utils/download.py index e94b1d68c8..75ef9164f6 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -7,8 +7,9 @@ import urllib import urllib.request import zipfile +from collections.abc import Iterable from os.path import expanduser -from typing import Any, Iterable, List, Optional +from typing import Any from torch.utils.model_zoo import tqdm @@ -16,7 +17,7 @@ def stream_url( - url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True + url: str, start_byte: int | None = None, block_size: int = 32 * 1024, progress_bar: bool = True ) -> Iterable: """Stream url by chunk @@ -36,7 +37,7 @@ def stream_url( req = urllib.request.Request(url) if start_byte: - req.headers["Range"] = "bytes={}-".format(start_byte) + req.headers["Range"] = f"bytes={start_byte}-" with ( urllib.request.urlopen(req) as upointer, @@ -61,8 +62,8 @@ def stream_url( def download_url( url: str, download_folder: str, - filename: Optional[str] = None, - hash_value: Optional[str] = None, + filename: str | None = None, + hash_value: str | None = None, hash_type: str = "sha256", progress_bar: bool = True, resume: bool = False, @@ -88,10 +89,10 @@ def download_url( filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): mode = "ab" - local_size: Optional[int] = os.path.getsize(filepath) + local_size: int | None = os.path.getsize(filepath) elif not resume and os.path.exists(filepath): - raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"{filepath} already exists. 
Delete the file manually and retry.") else: mode = "wb" local_size = None @@ -100,7 +101,7 @@ def download_url( with open(filepath, "rb") as file_obj: if validate_file(file_obj, hash_value, hash_type): return - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -108,7 +109,7 @@ def download_url( with open(filepath, "rb") as file_obj: if hash_value and not validate_file(file_obj, hash_value, hash_type): - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: @@ -140,7 +141,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> return hash_func.hexdigest() == hash_value -def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: +def extract_archive(from_path: str, to_path: str | None = None, overwrite: bool = False) -> list[str]: """Extract archive. Args: from_path (str): the path of the archive. diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 8705873982..c06c2649ad 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,6 +1,5 @@ import logging import os -from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive @@ -21,7 +20,7 @@ def download_ljspeech(path: str): extract_archive(archive) -def download_vctk(path: str, use_kaggle: Optional[bool] = False): +def download_vctk(path: str, use_kaggle: bool | None = False): """Download and extract VCTK dataset. 
Args: @@ -49,7 +48,7 @@ def download_tweb(path: str): download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) -def download_libri_tts(path: str, subset: Optional[str] = "all"): +def download_libri_tts(path: str, subset: str | None = "all"): """Download and extract libri tts dataset. Args: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 090522493b..e1df6f6ed4 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,11 +1,11 @@ -# -*- coding: utf-8 -*- import datetime import importlib import logging import os import re +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable, Dict, Optional, TextIO, TypeVar, Union +from typing import Any, TextIO, TypeVar import torch from packaging.version import Version @@ -16,11 +16,11 @@ _T = TypeVar("_T") -def exists(val: Union[_T, None]) -> TypeIs[_T]: +def exists(val: _T | None) -> TypeIs[_T]: return val is not None -def default(val: Union[_T, None], d: Union[_T, Callable[[], _T]]) -> _T: +def default(val: _T | None, d: _T | Callable[[], _T]) -> _T: if exists(val): return val return d() if callable(d) else d @@ -69,7 +69,7 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: dict, kwargs: dict) -> dict: """Format kwargs to hande auxilary inputs to models. Args: @@ -108,9 +108,9 @@ def setup_logger( logger_name: str, level: int = logging.INFO, *, - formatter: Optional[logging.Formatter] = None, - stream: Optional[TextIO] = None, - log_dir: Optional[Union[str, os.PathLike[Any]]] = None, + formatter: logging.Formatter | None = None, + stream: TextIO | None = None, + log_dir: str | os.PathLike[Any] | None = None, log_name: str = "log", ) -> None: """Set up a logger. 
@@ -146,6 +146,6 @@ def is_pytorch_at_least_2_4() -> bool: return Version(torch.__version__) >= Version("2.4") -def optional_to_str(x: Optional[Any]) -> str: +def optional_to_str(x: Any | None) -> str: """Convert input to string, using empty string if input is None.""" return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 5dff1b84c8..20d6ab226b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,7 +6,7 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Any, Optional, TypedDict, Union +from typing import Any, TypedDict import fsspec import requests @@ -27,12 +27,12 @@ class ModelItem(TypedDict, total=False): license: str author: str contact: str - commit: Optional[str] + commit: str | None model_hash: str tos_required: bool - default_vocoder: Optional[str] - model_url: Union[str, list[str]] - github_rls_url: Union[str, list[str]] + default_vocoder: str | None + model_url: str | list[str] + github_rls_url: str | list[str] hf_url: list[str] @@ -49,7 +49,7 @@ class ModelItem(TypedDict, total=False): } -class ModelManager(object): +class ModelManager: tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download @@ -66,8 +66,8 @@ class ModelManager(object): def __init__( self, - models_file: Optional[Union[str, os.PathLike[Any]]] = None, - output_prefix: Optional[Union[str, os.PathLike[Any]]] = None, + models_file: str | os.PathLike[Any] | None = None, + output_prefix: str | os.PathLike[Any] | None = None, progress_bar: bool = False, ) -> None: super().__init__() @@ -84,7 +84,7 @@ def __init__( path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def read_models_file(self, file_path: str | os.PathLike[Any]) -> None: """Read .models.json as a dict Args: @@ -274,7 +274,7 @@ def set_model_url(model_item: ModelItem) -> ModelItem: model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, Optional[str]]: + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, str | None]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") @@ -389,7 +389,7 @@ def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, out logger.info("%s is already downloaded however it has been changed. Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelItem]: + def download_model(self, model_name: str) -> tuple[Path, Path | None, ModelItem]: """Download model files given the full model name. 
Model name is in the format 'type/language/dataset/model' @@ -471,7 +471,7 @@ def _find_files(output_path: Path) -> tuple[Path, Path]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: Path) -> Optional[Path]: + def _find_speaker_encoder(output_path: Path) -> Path | None: """Find the speaker encoder file in the output path Args: @@ -523,7 +523,7 @@ def _update_paths(self, output_path: Path, config_path: Path) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name: str, new_path: Optional[Path], config_path: Path) -> None: + def _update_path(field_name: str, new_path: Path | None, config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" if new_path is not None and new_path.is_file(): config = load_config(str(config_path)) @@ -619,9 +619,7 @@ def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) - rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files( - file_urls: list[str], output_folder: Union[str, os.PathLike[Any]], progress_bar: bool - ) -> None: + def _download_model_files(file_urls: list[str], output_folder: str | os.PathLike[Any], progress_bar: bool) -> None: """Download the github releases""" output_folder = Path(output_folder) for file_url in file_urls: diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index cbd14990f3..b5306d6ab3 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -9,13 +9,13 @@ class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) + raise ValueError(f"Invalid learning rate: {lr}") if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) + raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: - 
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index b08a763a33..4e8f3825b9 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,6 @@ import math import random -from typing import Callable, List, Union +from collections.abc import Callable from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler @@ -176,7 +176,7 @@ def __init__( data, batch_size, drop_last, - sort_key: Union[Callable, List] = identity, + sort_key: Callable | list = identity, bucket_size_multiplier=100, ): super().__init__(sampler, batch_size, drop_last) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index fafeddfd75..cebb094a48 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -2,7 +2,7 @@ import os import time from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any import numpy as np import pysbd @@ -30,18 +30,18 @@ class Synthesizer(nn.Module): def __init__( self, *, - tts_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - tts_config_path: Optional[Union[str, os.PathLike[Any]]] = None, - tts_speakers_file: Optional[Union[str, os.PathLike[Any]]] = None, - tts_languages_file: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - vc_checkpoint: 
Optional[Union[str, os.PathLike[Any]]] = None, - vc_config: Optional[Union[str, os.PathLike[Any]]] = None, - model_dir: Optional[Union[str, os.PathLike[Any]]] = None, - voice_dir: Optional[Union[str, os.PathLike[Any]]] = None, + tts_checkpoint: str | os.PathLike[Any] | None = None, + tts_config_path: str | os.PathLike[Any] | None = None, + tts_speakers_file: str | os.PathLike[Any] | None = None, + tts_languages_file: str | os.PathLike[Any] | None = None, + vocoder_checkpoint: str | os.PathLike[Any] | None = None, + vocoder_config: str | os.PathLike[Any] | None = None, + encoder_checkpoint: str | os.PathLike[Any] | None = None, + encoder_config: str | os.PathLike[Any] | None = None, + vc_checkpoint: str | os.PathLike[Any] | None = None, + vc_config: str | os.PathLike[Any] | None = None, + model_dir: str | os.PathLike[Any] | None = None, + voice_dir: str | os.PathLike[Any] | None = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. It takes a tts and a vocoder @@ -248,7 +248,7 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N if use_cuda: self.vocoder_model.cuda() - def split_into_sentences(self, text) -> List[str]: + def split_into_sentences(self, text) -> list[str]: """Split give text into sentences. Args: @@ -259,7 +259,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + def save_wav(self, wav: list[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. 
Args: @@ -274,7 +274,7 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: Union[str, list[str]], **kwargs) -> List[int]: + def voice_conversion(self, source_wav: str, target_wav: str | list[str], **kwargs) -> list[int]: start_time = time.time() if not isinstance(target_wav, list): @@ -302,7 +302,7 @@ def tts( reference_speaker_name=None, split_sentences: bool = True, **kwargs, - ) -> List[int]: + ) -> list[int]: """🐸 TTS magic. Run all the models and generate speech. Args: diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index d600bfb1f4..37f8048b7f 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List, Optional from coqpit import Coqpit @@ -47,7 +46,7 @@ class FreeVCAudioConfig(Coqpit): win_length: int = field(default=1280) n_mel_channels: int = field(default=80) mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) + mel_fmax: float | None = field(default=None) @dataclass @@ -122,11 +121,11 @@ class FreeVCArgs(Coqpit): kernel_size: int = field(default=3) p_dropout: float = field(default=0.1) resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [10, 8, 2, 2]) upsample_initial_channel: int = field(default=512) - 
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) n_layers_q: int = field(default=3) use_spectral_norm: bool = field(default=False) gin_channels: int = field(default=256) @@ -269,7 +268,7 @@ class FreeVCConfig(BaseVCConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py index 261cdd6f47..167a61ddb3 100644 --- a/TTS/vc/configs/openvoice_config.py +++ b/TTS/vc/configs/openvoice_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from coqpit import Coqpit @@ -187,13 +186,13 @@ class OpenVoiceConfig(BaseVCConfig): # multi-speaker settings # use speaker embedding layer num_speakers: int = 0 - speakers_file: Optional[str] = None + speakers_file: str | None = None speaker_embedding_channels: int = 256 # use d-vectors use_d_vector_file: bool = False - d_vector_file: Optional[list[str]] = None - d_vector_dim: Optional[int] = None + d_vector_file: list[str] | None = None + d_vector_dim: int | None = None def __post_init__(self) -> None: for key, val in self.model_args.items(): diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index 3c6b1a32cf..b84a97e487 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @@ -132,7 +131,7 @@ class BaseVCConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # 
optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -140,7 +139,7 @@ class BaseVCConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/vc/layers/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py index 5fa317ce45..5d14bf2f19 100644 --- a/TTS/vc/layers/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Optional, Union # import webrtcvad import librosa @@ -16,7 +15,7 @@ int16_max = (2**15) - 1 -def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): +def preprocess_wav(fpath_or_wav: str | Path | np.ndarray, source_sr: int | None = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 
diff --git a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index 62fae59bc1..d2f4ffe394 100644 --- a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,6 +1,5 @@ import logging from time import perf_counter as timer -from typing import List import numpy as np import torch @@ -89,7 +88,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): assert 0 < min_coverage <= 1 # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + samples_per_frame = int(sampling_rate * mel_window_step / 1000) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) assert 0 < frame_step, "The rate is too high" @@ -162,7 +161,7 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ return embed, partial_embeds, wav_slices return embed - def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + def embed_speaker(self, wavs: list[np.ndarray], **kwargs): """ Compute the embedding of a collection of wavs (presumably from the same speaker) by averaging their embedding and L2-normalizing it. 
diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py index 37c1a6e877..06348e4bb4 100644 --- a/TTS/vc/layers/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -9,7 +9,6 @@ import math import warnings -from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -158,7 +157,7 @@ def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError(f"--activation-fn {activation} not supported") def init_bert_params(module): @@ -424,17 +423,17 @@ def compute_bias(self, query_length, key_length): def forward( self, query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + key: Tensor | None, + value: Tensor | None, + key_padding_mask: Tensor | None = None, + incremental_state: dict[str, dict[str, Tensor | None]] | None = None, need_weights: bool = True, static_kv: bool = False, - attn_mask: Optional[Tensor] = None, + attn_mask: Tensor | None = None, before_softmax: bool = False, need_head_weights: bool = False, - position_bias: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + position_bias: Tensor | None = None, + ) -> tuple[Tensor, Tensor | None, Tensor | None]: """Input shape: Time x Batch x Channel Args: @@ -605,7 +604,7 @@ def forward( else: assert v is not None v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: Optional[Tensor] = None + prev_key_padding_mask: Tensor | None = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None @@ -700,7 +699,7 @@ def forward( assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, 
embed_dim) attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None + attn_weights: Tensor | None = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: @@ -711,12 +710,12 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], + key_padding_mask: Tensor | None, + prev_key_padding_mask: Tensor | None, batch_size: int, src_len: int, static_kv: bool, - ) -> Optional[Tensor]: + ) -> Tensor | None: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask @@ -748,19 +747,19 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: + self, incremental_state: dict[str, dict[str, Tensor | None]] | None + ) -> dict[str, Tensor | None]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: - empty_result: Dict[str, Optional[Tensor]] = {} + empty_result: dict[str, Tensor | None] = {} return empty_result def _set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + incremental_state: dict[str, dict[str, Tensor | None]], + buffer: dict[str, Tensor | None], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index a398aaf977..cb2c0152e8 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import Any, List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -33,8 +33,8 @@ def compute_mask_indices( - shape: 
Tuple[int, int], - padding_mask: Optional[torch.Tensor], + shape: tuple[int, int], + padding_mask: torch.Tensor | None, mask_prob: float, mask_length: int, mask_type: str = "static", @@ -317,10 +317,10 @@ def forward_padding_mask( def extract_features( self, source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + padding_mask: torch.Tensor | None = None, mask: bool = False, ret_conv: bool = False, - output_layer: Optional[int] = None, + output_layer: int | None = None, ret_layer_results: bool = False, ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: @@ -367,7 +367,7 @@ def extract_features( class ConvFeatureExtractionModel(nn.Module): def __init__( self, - conv_layers: List[Tuple[int, int, int]], + conv_layers: list[tuple[int, int, int]], dropout: float = 0.0, mode: str = "default", conv_bias: bool = False, diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 6f7fb192b0..90afd997b4 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import torch import torch.distributed as dist @@ -37,9 +37,9 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: Optional[AudioProcessor] = None, - speaker_manager: Optional[SpeakerManager] = None, - language_manager: Optional[LanguageManager] = None, + ap: AudioProcessor | None = None, + speaker_manager: SpeakerManager | None = None, + language_manager: LanguageManager | None = None, ) -> None: super().__init__() self.config = config @@ -69,7 +69,7 @@ def _set_model_args(self, config: Coqpit) -> None: else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding 
channel size for defining `in_channels` size of the connected layers. @@ -106,7 +106,7 @@ def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: + def get_aux_input_from_test_sentences(self, sentence_info: str | list[str]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -275,10 +275,10 @@ def get_data_loader( config: Coqpit, assets: dict, is_eval: bool, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: Optional[int] = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -402,13 +402,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer: Trainer) -> None: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index 104ad9ae6c..7d946a1eff 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,5 +1,4 @@ import logging -from typing import Optional, Tuple, Union import librosa import numpy as np @@ -331,15 +330,15 @@ def forward( self, c: torch.Tensor, spec: torch.Tensor, - g: 
Optional[torch.Tensor] = None, - mel: Optional[torch.Tensor] = None, - c_lengths: Optional[torch.Tensor] = None, - spec_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[ + g: torch.Tensor | None = None, + mel: torch.Tensor | None = None, + c_lengths: torch.Tensor | None = None, + spec_lengths: torch.Tensor | None = None, + ) -> tuple[ torch.Tensor, torch.Tensor, torch.Tensor, - Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ]: """ Forward pass of the model. @@ -431,7 +430,7 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src: Union[str, torch.Tensor], tgt: list[Union[str, torch.Tensor]]): + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]): """ Voice conversion pass of the model. diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py index 968873cfec..c31f52e749 100644 --- a/TTS/vc/models/knnvc.py +++ b/TTS/vc/models/knnvc.py @@ -1,12 +1,11 @@ import logging import os -from typing import Any, Optional, Union +from typing import Any, TypeAlias import torch import torch.nn.functional as F import torchaudio from coqpit import Coqpit -from typing_extensions import TypeAlias from TTS.vc.configs.knnvc_config import KNNVCConfig from TTS.vc.layers.freevc.wavlm import get_wavlm @@ -14,7 +13,7 @@ logger = logging.getLogger(__name__) -PathOrTensor: TypeAlias = Union[str, os.PathLike[Any], torch.Tensor] +PathOrTensor: TypeAlias = str | os.PathLike[Any] | torch.Tensor class KNNVC(BaseVC): @@ -126,9 +125,9 @@ def match( self, query_seq: torch.Tensor, matching_set: torch.Tensor, - synth_set: Optional[torch.Tensor] = None, - topk: Optional[int] = None, - target_duration: Optional[float] = None, + synth_set: torch.Tensor | None = None, + topk: int | None = None, + target_duration: float | None = None, ) -> torch.Tensor: """Given `query_seq`, 
`matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching with k=`topk`. @@ -162,7 +161,7 @@ def match( out_feats = synth_set[best.indices].mean(dim=1) return out_feats.unsqueeze(0) - def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: Union[str, os.PathLike[Any]]) -> None: + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: str | os.PathLike[Any]) -> None: """kNN-VC does not use checkpoints.""" def forward(self) -> None: ... @@ -173,7 +172,7 @@ def voice_conversion( self, source: PathOrTensor, target: list[PathOrTensor], - topk: Optional[int] = None, + topk: int | None = None, ) -> torch.Tensor: if not isinstance(target, list): target = [target] diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py index 3cb37e64b5..1049a580c7 100644 --- a/TTS/vc/models/openvoice.py +++ b/TTS/vc/models/openvoice.py @@ -1,8 +1,9 @@ import json import logging import os +from collections.abc import Mapping from pathlib import Path -from typing import Any, Mapping, Optional, Union +from typing import Any import librosa import numpy as np @@ -117,7 +118,7 @@ class OpenVoice(BaseVC): October 2023, serving as the backend of MyShell. """ - def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = None) -> None: + def __init__(self, config: Coqpit, speaker_manager: SpeakerManager | None = None) -> None: super().__init__(config, None, speaker_manager, None) self.init_multispeaker(config) @@ -178,7 +179,7 @@ def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = N def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": return OpenVoice(config) - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize multi-speaker modules of a model. 
A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -195,7 +196,7 @@ def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> def load_checkpoint( self, config: OpenVoiceConfig, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -219,7 +220,7 @@ def train_step(self) -> None: ... def eval_step(self) -> None: ... @staticmethod - def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tensor]]) -> torch.Tensor: + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, torch.Tensor | None]) -> torch.Tensor: if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: return aux_input["x_lengths"] return torch.tensor(x.shape[-1:]).to(x.device) @@ -228,7 +229,7 @@ def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tenso def inference( self, x: torch.Tensor, - aux_input: Mapping[str, Optional[torch.Tensor]] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + aux_input: Mapping[str, torch.Tensor | None] = {"x_lengths": None, "g_src": None, "g_tgt": None}, ) -> dict[str, torch.Tensor]: """ Inference pass of the model @@ -267,7 +268,7 @@ def inference( "z_hat": z_hat, } - def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list[float]]) -> torch.Tensor: + def load_audio(self, wav: str | npt.NDArray[np.float32] | torch.Tensor | list[float]) -> torch.Tensor: """Read and format the input audio.""" if isinstance(wav, str): out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) @@ -279,7 +280,7 @@ def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list out = wav return out.to(self.device).float() - def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + def extract_se(self, audio: str | torch.Tensor) -> 
tuple[torch.Tensor, torch.Tensor]: y = self.load_audio(audio) y = y.to(self.device) y = y.unsqueeze(0) @@ -296,9 +297,7 @@ def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, tor return g, spec @torch.inference_mode() - def voice_conversion( - self, src: Union[str, torch.Tensor], tgt: list[Union[str, torch.Tensor]] - ) -> npt.NDArray[np.float32]: + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]) -> npt.NDArray[np.float32]: """ Voice conversion pass of the model. diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py index 67f324cfce..85662831ee 100644 --- a/TTS/vocoder/configs/univnet_config.py +++ b/TTS/vocoder/configs/univnet_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @@ -96,7 +95,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # model specific params discriminator_model: str = "univnet_discriminator" generator_model: str = "univnet_generator" - generator_model_params: Dict = field( + generator_model_params: dict = field( default_factory=lambda: { "in_channels": 64, "out_channels": 1, @@ -121,7 +120,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # loss weights - overrides stft_loss_weight: float = 2.5 - stft_loss_params: Dict = field( + stft_loss_params: dict = field( default_factory=lambda: { "n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], @@ -133,7 +132,7 @@ class UnivnetConfig(BaseGANVocoderConfig): hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 0 l1_spec_loss_weight: float = 0 - l1_spec_loss_params: Dict = field( + l1_spec_loss_params: dict = field( default_factory=lambda: { "use_mel": True, "sample_rate": 22050, @@ -153,7 +152,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = None # one of the 
schedulers from https:#pytorch.org/docs/stable/optim.html # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) steps_to_start_discriminator: int = 200000 def __post_init__(self): diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 04462817a8..d935209348 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -10,7 +10,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: list) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 6f34bccb7c..3ae9015451 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -2,7 +2,6 @@ import os import random from multiprocessing import Manager -from typing import List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def __getitem__(self, idx): item = self.load_item(idx) return item - def load_test_samples(self, num_samples: int) -> List[Tuple]: + def load_test_samples(self, num_samples: int) -> list[tuple]: """Return test samples. 
Args: diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 8d4dd725ef..0fad81864e 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,5 +1,3 @@ -from typing import Dict, Union - import torch from torch import nn from torch.nn import functional as F @@ -352,7 +350,7 @@ def forward(self, scores_fake, scores_real): class WaveRNNLoss(nn.Module): - def __init__(self, wave_rnn_mode: Union[str, int]): + def __init__(self, wave_rnn_mode: str | int): super().__init__() if wave_rnn_mode == "mold": self.loss_func = discretized_mix_logistic_loss @@ -363,6 +361,6 @@ def __init__(self, wave_rnn_mode: Union[str, int]): else: raise ValueError(" [!] Unknown mode for Wavernn.") - def forward(self, y_hat, y) -> Dict: + def forward(self, y_hat, y) -> dict: loss = self.loss_func(y_hat, y) return {"loss": loss} diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 7785d8011c..42dfef32b7 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -1,5 +1,4 @@ from inspect import signature -from typing import Dict, List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def inference(self, x: torch.Tensor) -> torch.Tensor: """ return self.model_g.inference(x) - def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict, optimizer_idx: int) -> tuple[dict, dict]: """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for network on the current pass. @@ -185,7 +184,7 @@ def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[ outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tuple[dict, dict]: """Logging shared by the training and evaluation. 
Args: @@ -205,22 +204,22 @@ def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tup return figures, audios def train_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.inference_mode() - def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) @@ -259,7 +258,7 @@ def on_train_step_start(self, trainer) -> None: """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. 
@@ -275,7 +274,7 @@ def get_optimizer(self) -> List: ) return [optimizer2, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -283,7 +282,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -297,7 +296,7 @@ def get_scheduler(self, optimizer) -> List: return [scheduler2, scheduler1] @staticmethod - def format_batch(batch: List) -> Dict: + def format_batch(batch: list) -> dict: """Format the batch for training. Args: @@ -316,9 +315,9 @@ def format_batch(batch: List) -> Dict: def get_data_loader( # pylint: disable=no-self-use, unused-argument self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, rank: int = None, # pylint: disable=unused-argument diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 19f5648f4d..82909231ee 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,5 +1,4 @@ import logging -from typing import List import numpy as np import torch @@ -21,7 +20,7 @@ def __init__( out_channels: int, hidden_channels: int, cond_channels: int, - upsample_factors: List[int], + upsample_factors: list[int], lvc_layers_each_block: int, lvc_kernel_size: int, kpnet_hidden_channels: int, diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index d756f956dd..16c66e235b 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -25,10 +24,10 @@ class WavegradArgs(Coqpit): use_weight_norm: bool = False y_conv_channels: int = 32 x_conv_channels: int = 768 - 
dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) - ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) - upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) - upsample_dilations: List[List[int]] = field( + dblock_out_channels: list[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: list[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: list[list[int]] = field( default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] ) @@ -242,7 +241,7 @@ def load_checkpoint( ) self.compute_noise_level(betas) - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: # format data x = batch["input"] y = batch["waveform"] @@ -258,20 +257,20 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: pass @torch.inference_mode() - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument + self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument ) -> None: pass - def test(self, 
assets: Dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument + def test(self, assets: dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument # setup noise schedule and inference ap = assets["audio_processor"] noise_schedule = self.config["test_noise_schedule"] @@ -302,13 +301,13 @@ def get_criterion(): return torch.nn.L1Loss() @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: # return a whole audio segment m, y = batch[0], batch[1] y = y.unsqueeze(1) return {"input": m, "waveform": y} - def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: List, verbose: bool, num_gpus: int): + def get_data_loader(self, config: Coqpit, assets: dict, is_eval: True, samples: list, verbose: bool, num_gpus: int): ap = assets["audio_processor"] dataset = WaveGradDataset( ap=ap, diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 4ece55af62..2fe55f91bc 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,7 +1,6 @@ import sys import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -171,7 +170,7 @@ class WavernnArgs(Coqpit): num_res_blocks: int = 10 use_aux_net: bool = True use_upsample_net: bool = True - upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 8, 8]) mode: str = "mold" # mold [string], gauss [string], bits [int] mulaw: bool = True # apply mulaw if mode is bits pad: int = 2 @@ -537,7 +536,7 @@ def load_checkpoint( self.eval() assert not self.training - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: mels = batch["input"] waveform = batch["waveform"] waveform_coarse = batch["waveform_coarse"] @@ -552,13 +551,13 @@ def train_step(self, batch: 
Dict, criterion: Dict) -> Tuple[Dict, Dict]: loss_dict = criterion(y_hat, waveform_coarse) return {"model_output": y_hat}, loss_dict - def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return self.train_step(batch, criterion) @torch.no_grad() def test( - self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument - ) -> Tuple[Dict, Dict]: + self, assets: dict, test_loader: "DataLoader", output: dict # pylint: disable=unused-argument + ) -> tuple[dict, dict]: ap = self.ap figures = {} audios = {} @@ -579,14 +578,14 @@ def test( return figures, audios def test_log( - self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: waveform = batch[0] mels = batch[1] waveform_coarse = batch[2] @@ -595,9 +594,9 @@ def format_batch(batch: Dict) -> Dict: def get_data_loader( # pylint: disable=no-self-use self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, ): diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index fe706ba9ff..bef68e5564 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -12,7 +12,7 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = 
-0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp(-2.0 * log_std)) return log_probs.squeeze().mean() diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index ac797d97f7..2823d206a0 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,5 +1,4 @@ import logging -from typing import Dict import numpy as np import torch @@ -32,7 +31,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: +def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> dict: """Plot the predicted and the real waveform and their spectrograms. Args: diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 4855886efd..44bf25c071 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -43,7 +43,7 @@ def process_meta_data(path): meta_data = {} # load meta data - with open(path, "r", encoding="utf-8") as f: + with open(path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") for row in data: frames = int(row[2]) @@ -58,7 +58,7 @@ def process_meta_data(path): "utt": utt, "frames": frames, "audio_len": audio_len, - "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]), + "row": f"{row[0]}|{row[1]}|{row[2]}|{row[3]}", } ) @@ -156,7 +156,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): phonemes = {} - with open(train_path, "r", encoding="utf-8") as f: + with open(train_path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") phonemes["None"] = 0 for row in data: diff --git a/pyproject.toml b/pyproject.toml index bc5b869dce..d84b3bb947 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -201,6 +201,7 @@ lint.extend-select = [ "PLR0911", # too-many-return-statements "PLR1711", # useless-return "PLW", + "UP", # 
pyupgrade "W291", # trailing-whitespace "NPY201", # NumPy 2.0 deprecation ] diff --git a/tests/__init__.py b/tests/__init__.py index 8108bdeb50..1a03d07552 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,6 @@ import os -from typing import Callable, Optional +from collections.abc import Callable +from typing import Optional import pytest from trainer.generic_utils import get_cuda @@ -46,7 +47,7 @@ def run_cli(command): assert exit_status == 0, f" [!] command `{command}` failed." -def run_main(main_func: Callable, args: Optional[list[str]] = None, expected_code: int = 0): +def run_main(main_func: Callable, args: list[str] | None = None, expected_code: int = 0): with pytest.raises(SystemExit) as exc_info: main_func(args) assert exc_info.value.code == expected_code diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index f260af161e..975281c549 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -51,7 +51,7 @@ if not os.path.exists(c.data_path): DATA_EXIST = False -print(" > Dynamic data loader test: {}".format(DATA_EXIST)) +print(f" > Dynamic data loader test: {DATA_EXIST}") def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py index 794478dca3..2290e9a6cc 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -21,7 +21,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -29,14 +29,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() 
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -52,7 +52,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -60,14 +60,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class MSELossMaskedTests(unittest.TestCase): @@ -85,7 +85,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 
128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -93,14 +93,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -116,7 +116,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -124,14 +124,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = 
dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class SSIMLossTests(unittest.TestCase): @@ -153,7 +153,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.ones(4) * 58).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() >= 1.0, "0 vs {}".format(output.item()) + assert output.item() >= 1.0, f"0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -168,7 +168,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -184,7 +184,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 57, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -192,14 +192,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 57, 
128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class BCELossTest(unittest.TestCase): diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 191e0a19ee..12791feda4 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -63,7 +63,7 @@ def test_train(tmp_path): continue_speakers_path = config.d_vector_file # Check integrity of the config - with open(continue_config_path, "r", encoding="utf-8") as f: + with open(continue_config_path, encoding="utf-8") as f: config_loaded = json.load(f) assert config_loaded["characters"] is not None assert config_loaded["output_path"] in str(continue_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 72b6bcd46b..9a8027736e 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,9 +72,9 @@ def test_train_step(self): # pylint: disable=no-self-use for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" count += 1 @@ -131,9 +131,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 @@ -198,9 +198,9 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 # with file gst style @@ -254,9 +254,9 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 @@ -321,9 +321,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 43e72417c2..9521cfea26 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -67,8 +67,8 @@ def test_in_out(): output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None) assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) + assert output.shape[1] == 80, f"size not {output.shape[1]}" + assert output.shape[2] == 2, f"size not {output.shape[2]}" assert stop_tokens.shape[0] == 4 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 7ec3f0df1b..e71baa574d 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -71,9 +71,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" count += 1 @@ -127,9 +127,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 @@ -186,9 +186,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 # with file gst style @@ -238,9 +238,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 @@ -305,9 +305,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" count += 1 @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count += 1 diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index c8a52e1c1b..f0b347b895 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -373,9 +373,9 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - name, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {name} with shape {param.shape} not updated!! \n{param}\n{param_ref}" count = count + 1 def _create_batch(self, config, batch_size): diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 3c7ac51556..d0557b6311 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -42,9 +42,9 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) + assert ( + param != param_ref + ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" count += 1 def test_init_multispeaker(self): diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 7530bec426..d1d3610b70 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -47,6 +47,4 @@ def test_train_step(): for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - i, param.shape, param, param_ref - ) + assert (param != param_ref).any(), f"param {i} with shape {param.shape} not updated!! \n{param}\n{param_ref}" From 4b23fb2cbb61622482dee67f98b961616eb8bb85 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 00:59:44 +0100 Subject: [PATCH 5/8] refactor: apply unsafe automatic ruff lint fixes Manually checked and adjusted after generating with: uv run ruff check tests/ TTS/ notebooks/ recipes/ --fix --unsafe-fixes --- TTS/bin/train_encoder.py | 2 +- TTS/config/__init__.py | 4 +- TTS/encoder/models/base_encoder.py | 4 +- TTS/encoder/models/resnet.py | 6 +- TTS/encoder/utils/prepare_voxceleb.py | 12 ++-- TTS/tts/datasets/__init__.py | 9 ++- TTS/tts/layers/delightful_tts/conv_layers.py | 4 +- TTS/tts/layers/delightful_tts/networks.py | 4 +- TTS/tts/layers/tortoise/diffusion.py | 4 +- TTS/tts/layers/tortoise/vocoder.py | 4 +- TTS/tts/layers/xtts/tokenizer.py | 64 +++++++++---------- TTS/tts/layers/xtts/trainer/gpt_trainer.py | 2 +- TTS/tts/layers/xtts/zh_num2words.py | 11 ++-- TTS/tts/models/__init__.py | 1 - TTS/tts/utils/text/cmudict.py | 2 +- TTS/tts/utils/text/english/abbreviations.py | 2 +- TTS/tts/utils/text/french/abbreviations.py | 4 +- .../utils/text/phonemizers/espeak_wrapper.py | 2 +- TTS/tts/utils/text/tokenizer.py | 2 +- TTS/utils/radam.py | 2 +- TTS/vc/layers/freevc/modules.py | 6 +- TTS/vc/layers/freevc/wavlm/modules.py | 6 +- 
TTS/vc/models/__init__.py | 1 - TTS/vc/models/freevc.py | 6 +- TTS/vocoder/datasets/__init__.py | 2 - TTS/vocoder/datasets/gan_dataset.py | 2 +- TTS/vocoder/datasets/wavernn_dataset.py | 2 +- TTS/vocoder/layers/wavegrad.py | 2 +- .../models/parallel_wavegan_discriminator.py | 4 +- .../models/parallel_wavegan_generator.py | 2 +- TTS/vocoder/models/univnet_generator.py | 2 +- tests/tts_tests/test_tacotron_model.py | 12 ++-- tests/tts_tests2/test_glow_tts.py | 8 +-- tests/vc_tests/test_freevc.py | 2 +- 34 files changed, 98 insertions(+), 104 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 8d377db241..914c729856 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -296,7 +296,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion, args.restore_step = model.load_checkpoint( c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion ) - print(" > Model restored from step %d" % args.restore_step, flush=True) + print(f" > Model restored from step {args.restore_step}", flush=True) else: args.restore_step = 0 diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index e47a141c73..401003504e 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Any, Dict, Union +from typing import Any, Union import fsspec import yaml @@ -58,7 +58,7 @@ def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: - config_dict (Dict): A dictionary including the config fields. + config_dict (dict): A dictionary including the config fields. Returns: str: Formatted modelname. 
diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 603481cc56..c6680c3a25 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -34,7 +34,7 @@ class BaseEncoder(nn.Module): # pylint: disable=W0102 def __init__(self): - super(BaseEncoder, self).__init__() + super().__init__() def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( @@ -107,7 +107,7 @@ def get_criterion(self, c: Coqpit, num_classes=None): elif c.loss == "softmaxproto": criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) else: - raise Exception("The %s not is a loss supported" % c.loss) + raise Exception(f"The {c.loss} not is a loss supported") return criterion def load_checkpoint( diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 5eafcd6005..d7f3a2f4bd 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -7,7 +7,7 @@ class SELayer(nn.Module): def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() + super().__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), @@ -27,7 +27,7 @@ class SEBasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) @@ -73,7 +73,7 @@ def __init__( use_torch_spec=False, audio_config=None, ): - super(ResNetSpeakerEncoder, self).__init__() + super().__init__() self.encoder_type = encoder_type self.input_dim = input_dim diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index 18ca21c876..fe57874a99 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ 
b/TTS/encoder/utils/prepare_voxceleb.py @@ -82,17 +82,17 @@ def download_and_extract(directory, subset, urls): continue logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( - "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + "wget {} --user {} --password {} -O {}".format(url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size) # concatenate all parts into zip files if ".zip" not in zip_filepath: zip_filepath = "_".join(zip_filepath.split("_")[:-1]) - subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + subprocess.call(f"cat {zip_filepath}* > {zip_filepath}.zip", shell=True) zip_filepath += ".zip" extract_path = zip_filepath.strip(".zip") @@ -100,12 +100,12 @@ def download_and_extract(directory, subset, urls): with open(zip_filepath, "rb") as f_zip: md5 = hashlib.md5(f_zip.read()).hexdigest() if md5 != MD5SUM[subset]: - raise ValueError("md5sum of %s mismatch" % zip_filepath) + raise ValueError(f"md5sum of {zip_filepath} mismatch") with zipfile.ZipFile(zip_filepath, "r") as zfile: zfile.extractall(directory) extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) - subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + subprocess.call(f"mv {extract_path_ori} {extract_path}", shell=True) finally: # os.remove(zip_filepath) pass @@ -193,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info(f"Successfully generated csv file {csv_file_path}") + logger.info("Successfully generated csv file %s", csv_file_path) def processor(directory, subset, 
force_process): diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index a99eb4290d..2f5357c642 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -4,7 +4,6 @@ from collections import Counter from collections.abc import Callable from pathlib import Path -from typing import Dict, List, Tuple, Union import numpy as np @@ -18,7 +17,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): + items (list[list]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. eval_split_max_size (int): @@ -76,12 +75,12 @@ def load_tts_samples( eval_split_max_size=None, eval_split_size=0.01, ) -> tuple[list[list], list[list]]: - """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. + """Parse the dataset from the datasets config, load the samples as a list and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. Args: - datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are + datasets (list[dict], dict): A list of datasets or a single dataset dictionary. If multiple datasets are in the list, they are all merged. eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate @@ -100,7 +99,7 @@ def load_tts_samples( If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). Returns: - Tuple[List[List], List[List]: training and evaluation splits of the dataset. + tuple[list[list], list[list]: training and evaluation splits of the dataset. 
""" meta_data_train_all = [] meta_data_eval_all = [] if eval_split else None diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index 588d236852..5cf41d4ff6 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -50,7 +50,7 @@ def __init__( w_init_gain="linear", use_weight_norm=False, ): - super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments + super().__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) @@ -92,7 +92,7 @@ def __init__( lstm_type="bilstm", use_linear=True, ): - super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.out_dim = out_dim self.lstm_type = lstm_type self.use_linear = use_linear diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index d0a4adae79..93b65a2a74 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -51,7 +51,7 @@ def __init__( kernel_size=3, use_partial_padding=False, # pylint: disable=unused-argument ): - super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.reduction_factor = reduction_factor reduced_dim = int(in_dim / reduction_factor) @@ -194,7 +194,7 @@ class STL(nn.Module): """ def __init__(self, n_hidden: int, token_num: int): - super(STL, self).__init__() # pylint: disable=super-with-arguments + super().__init__() num_heads = 1 E = n_hidden diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 2b29091b44..cfb8fa800d 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -653,7 +653,7 @@ def p_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if 
noise is not None: img = noise else: @@ -805,7 +805,7 @@ def ddim_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index 6fd784f486..e7497d8190 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -293,7 +293,7 @@ def __init__( hop_length=256, n_mel_channels=100, ): - super(UnivNetGenerator, self).__init__() + super().__init__() self.mel_channel = n_mel_channels self.noise_dim = noise_dim self.hop_length = hop_length @@ -344,7 +344,7 @@ def forward(self, c, z): return z def eval(self, inference=False): - super(UnivNetGenerator, self).eval() + super().eval() # don't remove weight norm while validation in training loop if inference: self.remove_weight_norm() diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 8af4e78a40..ef4162a1cb 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -76,7 +76,7 @@ def split_sentence(text, lang, text_split_length=250): # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), @@ -99,7 +99,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "seÃąora"), ("sr", "seÃąor"), @@ -112,7 +112,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "fr": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mme", "madame"), ("mr", "monsieur"), @@ -124,7 +124,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "de": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("fr", "frau"), ("dr", "doktor"), @@ -134,7 +134,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "senhora"), ("sr", "senhor"), @@ -147,7 +147,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "it": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # ("sig.ra", "signora"), ("sig", "signore"), @@ -159,7 +159,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("p", "pani"), ("m", "pan"), @@ -169,19 +169,19 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # There are not many common abbreviations in Arabic as in English. ] ], "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "cs": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("ing", "inÅženÃŊr"), # engineer @@ -190,7 +190,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\b", re.IGNORECASE), x[1]) for x in [ ("Đŗ-ĐļĐ°", "ĐŗĐžŅĐŋĐžĐļĐ°"), # Mrs. ("Đŗ-ĐŊ", "ĐŗĐžŅĐŋОдиĐŊ"), # Mr. @@ -199,7 +199,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. @@ -209,7 +209,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("b", "bay"), # Mr. ("byk", "bÃŧyÃŧk"), # bÃŧyÃŧk @@ -218,7 +218,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "hu": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("b", "bÃĄcsi"), # Mr. @@ -227,13 +227,13 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. 
] @@ -249,7 +249,7 @@ def expand_abbreviations_multilingual(text, lang="en"): _symbols_multilingual = { "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " and "), ("@", " at "), @@ -261,7 +261,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " y "), ("@", " arroba "), @@ -273,7 +273,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " et "), ("@", " arobase "), @@ -285,7 +285,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " und "), ("@", " at "), @@ -297,7 +297,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " arroba "), @@ -309,7 +309,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " chiocciola "), @@ -321,7 +321,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " i "), ("@", " małpa "), @@ -334,7 +334,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ar": [ # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", 
re.IGNORECASE), x[1]) for x in [ ("&", " Ųˆ "), ("@", " ØšŲ„Ų‰ "), @@ -347,7 +347,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "zh": [ # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 和 "), ("@", " 在 "), @@ -360,7 +360,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "cs": [ # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " a "), ("@", " na "), @@ -373,7 +373,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ru": [ # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " и "), ("@", " ŅĐžĐąĐ°ĐēĐ° "), @@ -386,7 +386,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "nl": [ # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " en "), ("@", " bij "), @@ -398,7 +398,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ve "), ("@", " at "), @@ -410,7 +410,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " Ês "), ("@", " kukac "), @@ -423,7 +423,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ko": [ # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ꡸ëĻŦęŗ  "), ("@", " ė— "), @@ -435,7 +435,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hi": [ - (re.compile(r"%s" % re.escape(x[0]), 
re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ā¤”ā¤° "), ("@", " ā¤ā¤Ÿ ā¤ĻāĨ€ ā¤°āĨ‡ā¤Ÿ "), diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 271e7b46af..e437bc35eb 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -426,7 +426,7 @@ def get_optimizer(self) -> list: v.is_norm = isinstance(m, norm_modules) v.is_emb = isinstance(m, emb_modules) - fpn = "%s.%s" % (mn, k) if mn else k # full param name + fpn = f"{mn}.{k}" if mn else k # full param name all_param_names.add(fpn) param_map[fpn] = v if v.is_bias or v.is_norm or v.is_emb: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index f0a1db786d..360d9b06c8 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -420,7 +420,7 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super(ChineseNumberUnit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t @@ -456,7 +456,7 @@ class ChineseNumberDigit(ChineseChar): """ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): - super(ChineseNumberDigit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -477,7 +477,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - super(ChineseMath, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -507,8 +507,7 @@ def __init__(self, positive, negative, point): self.point = point def __iter__(self): - for v in self.__dict__.values(): - yield v + yield from 
self.__dict__.values() # class OtherSymbol(object): @@ -702,7 +701,7 @@ def get_value(value_string, use_zeros=True): if isinstance(v, CND) and v.value == 2: next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None - if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, CNU | type(None)): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): result_symbols[i] = liang diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 0e1587b60b..4746b13ea2 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List, Union from TTS.utils.generic_utils import find_module diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index 041b42ff31..9c0df06196 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -119,7 +119,7 @@ def get_arpabet(word, cmudict, punctuation_symbols): word = word[:-1] arpabet = cmudict.lookup(word) if arpabet is not None: - return first_symbol + "{%s}" % arpabet[0] + last_symbol + return first_symbol + "{%s}" % arpabet[0] + last_symbol # noqa: UP031 return first_symbol + word + last_symbol diff --git a/TTS/tts/utils/text/english/abbreviations.py b/TTS/tts/utils/text/english/abbreviations.py index cd93c13c8e..20042b255b 100644 --- a/TTS/tts/utils/text/english/abbreviations.py +++ b/TTS/tts/utils/text/english/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in english: abbreviations_en = [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), diff --git a/TTS/tts/utils/text/french/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py index f580dfed7b..e317bbbf3a 100644 --- a/TTS/tts/utils/text/french/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("M", "monsieur"), ("Mlle", "mademoiselle"), @@ -38,7 +38,7 @@ ("boul", "boulevard"), ] ] + [ - (re.compile("\\b%s" % x[0]), x[1]) + (re.compile(f"\\b{x[0]}"), x[1]) for x in [ ("Mlle", "mademoiselle"), ("Mlles", "mesdemoiselles"), diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 5c58afdf08..dbcb8994a7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -183,7 +183,7 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False else: args.append("--ipa=1") if tie: - args.append("--tie=%s" % tie) + args.append(f"--tie={tie}") tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") tmp.write(text) diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 4d6c9e401e..07a8753884 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -164,7 +164,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): """ # init cleaners text_cleaner = None - if isinstance(config.text_cleaner, (str, list)): + if isinstance(config.text_cleaner, str | list): text_cleaner = getattr(cleaners, config.text_cleaner) # init characters diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index b5306d6ab3..b893d115c9 100644 --- a/TTS/utils/radam.py 
+++ b/TTS/utils/radam.py @@ -18,7 +18,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if isinstance(params, list | tuple) and len(params) > 0 and isinstance(params[0], dict): for param in params: if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] diff --git a/TTS/vc/layers/freevc/modules.py b/TTS/vc/layers/freevc/modules.py index c34f22d701..92df39b5e0 100644 --- a/TTS/vc/layers/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -48,7 +48,7 @@ def forward(self, x, x_mask): class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -122,7 +122,7 @@ def remove_weight_norm(self): class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() + super().__init__() self.convs1 = nn.ModuleList( [ weight_norm( @@ -198,7 +198,7 @@ def remove_weight_norm(self): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() + super().__init__() self.convs = nn.ModuleList( [ weight_norm( diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py index 06348e4bb4..cddacd69ab 100644 --- a/TTS/vc/layers/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -88,7 +88,7 @@ class Swish(nn.Module): def __init__(self): """Construct an MultiHeadedAttention object.""" - super(Swish, self).__init__() + super().__init__() self.act = 
torch.nn.Sigmoid() def forward(self, x): @@ -97,7 +97,7 @@ def forward(self, x): class GLU_Linear(nn.Module): def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): - super(GLU_Linear, self).__init__() + super().__init__() self.glu_type = glu_type self.output_dim = output_dim @@ -218,7 +218,7 @@ def quant_noise(module, p, block_size): return module # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + assert isinstance(module, nn.Linear | nn.Embedding | nn.Conv2d) # test whether module.weight has the right sizes wrt block_size is_conv = module.weight.ndim == 4 diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index 8151a0445e..859eaeb2a7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,7 +1,6 @@ import importlib import logging import re -from typing import Dict, List, Optional, Union from TTS.vc.configs.shared_configs import BaseVCConfig from TTS.vc.models.base_vc import BaseVC diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index 7d946a1eff..59af40a836 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -101,7 +101,7 @@ def __init__( upsample_kernel_sizes, gin_channels=0, ): - super(Generator, self).__init__() + super().__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) @@ -164,7 +164,7 @@ def remove_weight_norm(self): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() + super().__init__() periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -189,7 +189,7 @@ def forward(self, y, y_hat): class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): - super(SpeakerEncoder, self).__init__() 
+ super().__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index d935209348..cef6a50b05 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -1,5 +1,3 @@ -from typing import List - from coqpit import Coqpit from torch.utils.data import Dataset diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 0806c0d496..c0882c701f 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -32,7 +32,7 @@ def __init__( super().__init__() self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 4c4f5c48df..ffb71177c5 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -18,7 +18,7 @@ class WaveRNNDataset(Dataset): def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 9f1512c6d4..187e7062e2 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -74,7 +74,7 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert isinstance(dilation, (list, tuple)) + assert 
isinstance(dilation, list | tuple) assert len(dilation) == 4 self.factor = factor diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 211d45d91c..02ad60e0ff 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -71,7 +71,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -174,7 +174,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 0659a00cc1..f4ef3a0734 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -145,7 +145,7 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 82909231ee..d991941441 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -127,7 +127,7 @@ def apply_weight_norm(self): """Apply weight normalization module from all of the layers.""" def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): 
torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index e71baa574d..3976b9ae8d 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -51,7 +51,7 @@ def test_train_step(): criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -105,7 +105,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -165,7 +165,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -217,7 +217,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") 
model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -288,7 +288,7 @@ def test_train_step(): criterion = model.get_criterion() optimizer = model.get_optimizer() model.train() - print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron with Capacitron VAE model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -341,7 +341,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index d0557b6311..967e9ecb9e 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -107,7 +107,7 @@ def _test_forward(self, batch_size): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -134,7 +134,7 @@ def _test_forward_with_d_vector(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = 
model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -160,7 +160,7 @@ def _test_forward_with_speaker_id(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -261,7 +261,7 @@ def test_train_step(self): # reference model to compare model weights model_ref = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) count = 0 diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index fe07b2723c..dd45d6941f 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -55,7 +55,7 @@ def _test_forward(self, batch_size): config = FreeVCConfig() model = FreeVC(config).to(device) model.train() - print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) + print(f" > Num parameters for FreeVC model:{count_parameters(model)}") mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) From 5a4b8b5b9aee4334ae3ab66f00dfc04aaf2840ce Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 01:15:54 +0100 Subject: [PATCH 6/8] style: run ruff format --- TTS/encoder/configs/base_encoder_config.py | 6 +-- TTS/encoder/utils/prepare_voxceleb.py | 2 +- TTS/tts/configs/neuralhmm_tts_config.py | 12 ++--- TTS/tts/configs/overflow_config.py | 12 ++--- TTS/tts/configs/tacotron_config.py | 12 ++--- TTS/tts/datasets/__init__.py | 6 
+-- TTS/tts/datasets/formatters.py | 4 +- TTS/tts/layers/bark/hubert/kmeans_hubert.py | 1 - TTS/tts/layers/bark/inference_funcs.py | 4 +- TTS/tts/layers/bark/model.py | 6 +-- TTS/tts/layers/bark/model_fine.py | 6 +-- TTS/tts/layers/feed_forward/encoder.py | 6 +-- TTS/tts/layers/generic/pos_encoding.py | 2 +- TTS/tts/layers/generic/transformer.py | 4 +- TTS/tts/layers/tortoise/arch_utils.py | 6 +-- TTS/tts/layers/tortoise/audio_utils.py | 12 ++--- TTS/tts/layers/tortoise/autoregressive.py | 6 +-- TTS/tts/layers/tortoise/dpm_solver.py | 54 +++++-------------- TTS/tts/layers/tortoise/transformer.py | 6 +-- TTS/tts/layers/tortoise/xtransformers.py | 30 +++++------ TTS/tts/layers/xtts/gpt.py | 18 +++---- TTS/tts/layers/xtts/stream_generator.py | 1 - TTS/tts/layers/xtts/trainer/gpt_trainer.py | 6 ++- TTS/tts/models/align_tts.py | 12 ++--- TTS/tts/models/bark.py | 4 +- TTS/tts/models/base_tacotron.py | 10 ++-- TTS/tts/models/base_tts.py | 6 +-- TTS/tts/models/delightful_tts.py | 14 ++--- TTS/tts/models/forward_tts.py | 8 +-- TTS/tts/models/glow_tts.py | 26 +++------ TTS/tts/models/neuralhmm_tts.py | 14 ++--- TTS/tts/models/overflow.py | 14 ++--- TTS/tts/models/tacotron.py | 4 +- TTS/tts/models/tacotron2.py | 4 +- TTS/tts/models/tortoise.py | 6 +-- TTS/tts/models/vits.py | 36 ++++++------- TTS/tts/models/xtts.py | 18 +++---- TTS/tts/utils/helpers.py | 6 +-- TTS/tts/utils/speakers.py | 6 +-- TTS/tts/utils/ssim.py | 18 +++---- TTS/tts/utils/text/bangla/phonemizer.py | 2 +- TTS/tts/utils/text/characters.py | 6 +-- TTS/tts/utils/text/english/number_norm.py | 2 +- TTS/tts/utils/text/korean/korean.py | 2 +- TTS/tts/utils/text/phonemizers/base.py | 2 +- TTS/utils/audio/processor.py | 20 ++++--- TTS/utils/samplers.py | 6 +-- TTS/vc/layers/freevc/wavlm/modules.py | 2 +- TTS/vc/layers/freevc/wavlm/wavlm.py | 14 ++--- TTS/vc/models/base_vc.py | 6 +-- TTS/vocoder/datasets/gan_dataset.py | 6 +-- TTS/vocoder/datasets/wavegrad_dataset.py | 6 +-- TTS/vocoder/layers/losses.py | 12 ++--- 
TTS/vocoder/layers/lvc_block.py | 6 +-- TTS/vocoder/models/gan.py | 14 ++++- TTS/vocoder/models/hifigan_generator.py | 4 +- TTS/vocoder/models/melgan_generator.py | 4 +- .../models/parallel_wavegan_generator.py | 10 ++-- TTS/vocoder/models/wavegrad.py | 18 +++++-- TTS/vocoder/models/wavernn.py | 21 +++++--- tests/text_tests/test_phonemizer.py | 8 +-- tests/text_tests/test_text_cleaners.py | 4 +- tests/tts_tests/test_tacotron2_model.py | 36 ++++++------- tests/tts_tests/test_tacotron_model.py | 36 ++++++------- tests/tts_tests/test_vits.py | 6 +-- tests/tts_tests2/test_glow_tts.py | 14 ++--- tests/vc_tests/test_freevc.py | 12 ++--- tests/vc_tests/test_openvoice.py | 7 ++- 68 files changed, 333 insertions(+), 381 deletions(-) diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index 97cbf47893..d2d0ef580d 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -55,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig): def check_values(self): super().check_values() c = asdict(self) - assert ( - c["model_params"]["input_dim"] == self.audio.num_mels - ), " [!] model input dimendion must be equal to melspectrogram dimension." + assert c["model_params"]["input_dim"] == self.audio.num_mels, ( + " [!] model input dimendion must be equal to melspectrogram dimension." 
+ ) diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index fe57874a99..8d50ffd5f5 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -16,7 +16,7 @@ # Only support eager mode and TF>=2.0.0 # pylint: disable=no-member, invalid-name, relative-beyond-top-level # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes -""" voxceleb 1 & 2 """ +"""voxceleb 1 & 2""" import csv import hashlib diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index be7a81fa89..bd1736c880 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -161,9 +161,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. 
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index 8a113f1f33..93a6a9e377 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -192,9 +192,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 7badbfac59..e4b419d1fa 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -223,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. 
{self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 2f5357c642..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -37,9 +37,9 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), f" [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" + assert eval_split_size > 0, ( + f" [!] You do not have enough samples for the evaluation set. 
You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" + ) np.random.seed(0) np.random.shuffle(items) if is_multi_speaker: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 6cf65c9b5e..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -424,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -451,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index ade84794eb..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,7 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch import nn diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 1d141dc537..457a20ea28 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ 
b/TTS/tts/layers/bark/inference_funcs.py @@ -58,9 +58,7 @@ def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( model, voice: str, extra_voice_dirs: list[str] = [] -) -> tuple[ - npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None -]: # pylint: disable=dangerous-default-value +) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 54a9cecec0..4850d0a88b 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -175,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use assert idx.shape[1] >= 256 + 256 + 1 t = idx.shape[1] - 256 else: - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) # forward the GPT model itself if merge_context: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 29126b41ab..20f54d2152 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -101,9 +101,9 @@ def __init__(self, config): def forward(self, pred_idx, idx): device = idx.device b, t, codes = idx.size() - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) diff --git 
a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py index caf939ffc7..2d08f03c2d 100644 --- a/TTS/tts/layers/feed_forward/encoder.py +++ b/TTS/tts/layers/feed_forward/encoder.py @@ -143,9 +143,9 @@ def __init__( elif encoder_type.lower() == "residual_conv_bn": self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params) elif encoder_type.lower() == "fftransformer": - assert ( - in_hidden_channels == out_channels - ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + assert in_hidden_channels == out_channels, ( + "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + ) # pylint: disable=unexpected-keyword-arg self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) else: diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 695e37a6e0..7765e224aa 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,7 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError("Cannot use sin/cos positional encoding with " f"odd channels (got channels={channels:d})") + raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py index 9b7ecee2ba..2fe9bcc408 100644 --- a/TTS/tts/layers/generic/transformer.py +++ b/TTS/tts/layers/generic/transformer.py @@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument class FFTDurationPredictor: - def __init__( - self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None - ): # pylint: 
disable=unused-argument + def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None): # pylint: disable=unused-argument self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p) self.proj = nn.Linear(in_channels, 1) diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 1bbf676393..00fa559c77 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -101,9 +101,9 @@ def __init__( if num_head_channels == -1: self.num_heads = num_heads else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + assert channels % num_head_channels == 0, ( + f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + ) self.num_heads = channels // num_head_channels self.norm = normalization(channels) self.qkv = nn.Conv1d(channels, channels * 3, 1) diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 6d6bb8cdb7..6bbe6c389c 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -125,14 +125,14 @@ def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: - assert ( - len(latents) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(latents) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) clips.extend(clip) elif clip is None: - assert ( - len(clips) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(clips) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." 
+ ) latents.append(latent) if len(latents) == 0: return clips, None diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index cbfe076825..eaeb2a03c1 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -608,9 +608,9 @@ def inference_speech( if input_tokens is None: inputs = fake_inputs else: - assert ( - num_return_sequences % input_tokens.shape[0] == 0 - ), "The number of return sequences must be divisible by the number of input sequences" + assert num_return_sequences % input_tokens.shape[0] == 0, ( + "The number of return sequences must be divisible by the number of input sequences" + ) fake_inputs = fake_inputs.repeat(num_return_sequences, 1) input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) inputs = torch.cat([fake_inputs, input_tokens], dim=1) diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index d34b61f486..c8892d456a 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -563,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type if order == 3: K = steps // 3 + 1 if steps % 3 == 0: - orders = [ - 3, - ] * ( - K - 2 - ) + [2, 1] + orders = [3] * (K - 2) + [2, 1] elif steps % 3 == 1: - orders = [ - 3, - ] * ( - K - 1 - ) + [1] + orders = [3] * (K - 1) + [1] else: - orders = [ - 3, - ] * ( - K - 1 - ) + [2] + orders = [3] * (K - 1) + [2] elif order == 2: if steps % 2 == 0: K = steps // 2 - orders = [ - 2, - ] * K + orders = [2] * K else: K = steps // 2 + 1 - orders = [ - 2, - ] * ( - K - 1 - ) + [1] + orders = [2] * (K - 1) + [1] elif order == 1: K = 1 - orders = [ - 1, - ] * steps + orders = [1] * steps else: raise ValueError("'order' must be '1' or '2' or '3'.") if skip_type == "logSNR": @@ -605,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type 
timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) else: timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum( - torch.tensor( - [ - 0, - ] - + orders - ), - 0, - ).to(device) + torch.cumsum(torch.tensor([0] + orders), 0).to(device) ] return timesteps_outer, orders @@ -1217,9 +1189,9 @@ def inverse( """ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start t_T = self.noise_schedule.T if t_end is None else t_end - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) return self.sample( x, steps=steps, @@ -1362,9 +1334,9 @@ def sample( """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. 
For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) if return_intermediate: assert method in [ "multistep", diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index c1854bd196..531f294220 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -43,9 +43,9 @@ def route_args(router, args, depth): class SequentialSequence(nn.Module): def __init__(self, layers, args_route={}, layer_dropout=0.0): super().__init__() - assert all( - len(route) == len(layers) for route in args_route.values() - ), "each argument route map must have the same depth as the number of sequential layers" + assert all(len(route) == len(layers) for route in args_route.values()), ( + "each argument route map must have the same depth as the number of sequential layers" + ) self.layers = layers self.args_route = args_route self.layer_dropout = layer_dropout diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 0892fee19d..b2e74cf118 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -560,9 +560,9 @@ def __init__( self.rel_pos_bias = rel_pos_bias if rel_pos_bias: - assert ( - rel_pos_num_buckets <= rel_pos_max_distance - ), "number of relative position buckets must be less than the relative position max distance" + assert rel_pos_num_buckets <= rel_pos_max_distance, ( + "number of relative position buckets must be less than the relative position max distance" + ) self.rel_pos = RelativePositionBias( scale=dim_head**0.5, causal=causal, @@ -680,9 +680,9 @@ def forward( del input_mask if exists(attn_mask): - assert ( - 2 <= attn_mask.ndim <= 4 - ), "attention mask must have greater than 2 dimensions but less than or equal to 4" + assert 2 <= attn_mask.ndim <= 4, ( + "attention mask must have greater than 2 dimensions but less than or equal to 4" + ) if attn_mask.ndim == 2: 
attn_mask = rearrange(attn_mask, "i j -> () () i j") elif attn_mask.ndim == 3: @@ -790,9 +790,9 @@ def __init__( rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None - assert not ( - alibi_pos_bias and rel_pos_bias - ), "you can only choose Alibi positional bias or T5 relative positional bias, not both" + assert not (alibi_pos_bias and rel_pos_bias), ( + "you can only choose Alibi positional bias or T5 relative positional bias, not both" + ) if alibi_pos_bias: alibi_num_heads = default(alibi_num_heads, heads) @@ -922,9 +922,9 @@ def forward( past_key_values=None, expected_seq_len=None, ): - assert not ( - self.cross_attend ^ (exists(context) or exists(full_context)) - ), "context must be passed in if cross_attend is set to True" + assert not (self.cross_attend ^ (exists(context) or exists(full_context))), ( + "context must be passed in if cross_attend is set to True" + ) assert context is None or full_context is None, "only one of full_context or context can be provided" hiddens = [] @@ -940,9 +940,9 @@ def forward( rotary_pos_emb = None if exists(self.rotary_pos_emb): if not self.training and self.causal: - assert ( - expected_seq_len is not None - ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + assert expected_seq_len is not None, ( + "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + ) elif expected_seq_len is None: expected_seq_len = 0 seq_len = x.shape[1] diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index 20eff26ecc..4e0f53616d 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -347,12 +347,12 @@ def forward( audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1])) # 💖 Lovely assertions - assert ( - max_mel_len <= audio_codes.shape[-1] - ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] 
({audio_codes.shape[-1]})" - assert ( - max_text_len <= text_inputs.shape[-1] - ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + assert max_mel_len <= audio_codes.shape[-1], ( + f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" + ) + assert max_text_len <= text_inputs.shape[-1], ( + f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + ) # Append stop token to text inputs text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token) @@ -454,9 +454,9 @@ def forward( mel_targets[idx, l + 1 :] = -1 # check if stoptoken is in every row of mel_targets - assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[ - 0 - ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], ( + f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." 
+ ) # ignore the loss for the segment used for conditioning # coin flip for the segment to be ignored diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 303a990c27..e09a5233ac 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -953,7 +953,6 @@ def init_stream_support(): def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: - warpers = LogitsProcessorList() if generation_config.temperature is not None and generation_config.temperature != 1.0: diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index e437bc35eb..0a8af2f950 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -244,7 +244,11 @@ def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 return {"audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 12c3d18252..c2e29c7100 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -232,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward( - self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None - ): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: - x: :math:`[B, T_max]` @@ -351,9 +349,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use 
train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -366,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index ca1c301281..84814745a2 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -190,9 +190,7 @@ def _set_voice_dirs(self, voice_dirs): return _voice_dirs # TODO: remove config from synthesize - def synthesize( - self, text, config, speaker_id="random", voice_dirs=None, **kwargs - ): # pylint: disable=unused-argument + def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs): # pylint: disable=unused-argument """Synthesize speech with the given input text. 
Args: diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 8821036b5f..05f4ae168d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -93,9 +93,7 @@ def forward(self): def inference(self): pass - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load model checkpoint and set up internals. Args: @@ -176,7 +174,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 0976e4cdab..f5bc49e147 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -210,9 +210,9 @@ def format_batch(self, batch: dict) -> dict: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index 5db0f3df60..7b6103512c 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -831,9 +831,7 @@ def _log(self, batch, outputs, name_prefix="train"): audios[f"{name_prefix}/vocoder_audio"] = sample_voice return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use, unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use, unused-argument """Create visualizations and waveform examples. For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1046,7 +1044,11 @@ def test_run(self, assets) -> tuple[dict, dict]: return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) @@ -1258,9 +1260,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument self.energy_scaler.eval() @staticmethod - def init_from_config( - config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None - ): # pylint: disable=unused-argument + def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None): # pylint: disable=unused-argument """Initiate model from config Args: diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 5b68475406..497ac3f63a 100644 --- 
a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -770,9 +770,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -785,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 68b175afcc..5d03b53dc6 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -124,9 +124,9 @@ def init_multispeaker(self, config: Coqpit): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) if self.speaker_manager is not None: - assert ( - config.d_vector_dim == self.speaker_manager.embedding_dim - ), " [!] d-vector dimension mismatch b/w config and speaker manager." + assert config.d_vector_dim == self.speaker_manager.embedding_dim, ( + " [!] d-vector dimension mismatch b/w config and speaker manager." 
+ ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: logger.info("Init speaker_embedding layer.") @@ -192,9 +192,7 @@ def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = F.normalize(g).unsqueeze(-1) # [b, h, 1] return g - def forward( - self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Args: x (torch.Tensor): @@ -318,9 +316,7 @@ def inference_with_MAS( return outputs @torch.inference_mode() - def decoder_inference( - self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Shapes: - y: :math:`[B, T, C]` @@ -341,9 +337,7 @@ def decoder_inference( return outputs @torch.inference_mode() - def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = self._speaker_embedding(aux_input) # embedding pass @@ -456,9 +450,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) 
logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -521,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index a7c0ea7f14..2cbf425884 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -345,17 +345,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -369,7 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # 
pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 85e1523307..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -362,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -386,7 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 879a2b94b5..59173691f7 100644 --- 
a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -376,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index c8c0c875ad..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -399,9 +399,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 738e9dd9b3..b44a5fbfc6 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -685,9 +685,9 @@ def inference( text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. - assert ( - text_tokens.shape[-1] < 400 - ), "Too much text provided. Break the text up into separate segments and re-try inference." + assert text_tokens.shape[-1] < 400, ( + "Too much text provided. Break the text up into separate segments and re-try inference." 
+ ) if voice_samples is not None: ( diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 28c0ff99c1..3b6cee7ead 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1186,9 +1186,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unus ) return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use """Create visualizations and waveform examples. For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1295,7 +1293,11 @@ def test_run(self, assets) -> tuple[dict, dict]: return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) @@ -1364,9 +1366,9 @@ def format_batch_on_device(self, batch): ) if self.args.encoder_sample_rate: - assert batch["spec"].shape[2] == int( - batch["mel"].shape[2] / self.interpolate_factor - ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), ( + f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + ) else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" @@ -1536,9 +1538,7 @@ def get_criterion(self): return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] - def load_checkpoint( - self, config, checkpoint_path, eval=False, strict=True, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def 
load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) # compat band-aid for the pre-trained models to not use the encoder baked into the model @@ -1565,9 +1565,7 @@ def load_checkpoint( self.eval() assert not self.training - def load_fairseq_checkpoint( - self, config, checkpoint_dir, eval=False, strict=True - ): # pylint: disable=unused-argument, redefined-builtin + def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True): # pylint: disable=unused-argument, redefined-builtin """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms Performs some changes for compatibility. @@ -1626,15 +1624,15 @@ def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = No upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() if not config.model_args.encoder_sample_rate: - assert ( - upsample_rate == config.audio.hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + assert upsample_rate == config.audio.hop_length, ( + f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + ) else: encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor - assert ( - upsample_rate == effective_hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + assert upsample_rate == effective_hop_length, ( + f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + ) ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 63f161d05f..2df07a0435 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -379,9 +379,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa as latents used at inference. """ - assert ( - "zh-cn" if language == "zh" else language in self.config.languages - ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + assert "zh-cn" if language == "zh" else language in self.config.languages, ( + f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + ) # Use generally found best tuning knobs for generation. settings = { "temperature": config.temperature, @@ -519,9 +519,9 @@ def inference( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) with torch.no_grad(): gpt_codes = self.gpt.generate( @@ -627,9 +627,9 @@ def inference_stream( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." 
+ ) fake_inputs = self.gpt.compute_embeddings( gpt_cond_latent.to(self.device), diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index cf02e5282b..a3648eff4b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -105,9 +105,9 @@ def rand_segments( _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size else: - assert all( - len_diff > 0 - ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all(len_diff > 0), ( + f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + ) segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 026039ab29..6fab27de5a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.name_to_id speaker_manager.load_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." + assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), ( + " [!] You cannot introduce new speakers to a pre-trained model." + ) elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. 
speaker_manager.load_embeddings_from_file(c.d_vector_file) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 24bab63ca1..660370a832 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -49,16 +49,16 @@ def _validate_input( if size_range is None: assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - assert ( - t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] - ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], ( + f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + ) if dim_range[0] == dim_range[1]: assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" elif dim_range[0] < dim_range[1]: - assert ( - dim_range[0] <= t.dim() <= dim_range[1] - ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + assert dim_range[0] <= t.dim() <= dim_range[1], ( + f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + ) if data_range[0] < data_range[1]: assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" @@ -285,8 +285,7 @@ def _ssim_per_channel( """ if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 @@ -337,8 +336,7 @@ def _ssim_per_channel_complex( n_channels = x.size(1) if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. 
" - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index cddcb00fd5..1537240380 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -45,7 +45,7 @@ def tag_text(text: str): # create start and end text = "start" + text + "end" # tag text - parts = re.split("[\u0600-\u06FF]+", text) + parts = re.split("[\u0600-\u06ff]+", text) # remove non chars parts = [p for p in parts if p.strip()] # unique parts diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index da30692f5e..f8beaef036 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -289,9 +289,9 @@ def _create_vocab(self): self.vocab = _vocab + list(self._punctuations) if self.is_unique: duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} - assert ( - len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {duplicates}" + assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), ( + f" [!] There are duplicate characters in the character set. 
{duplicates}" + ) def char_to_id(self, char: str) -> int: try: diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index c5f2f452d5..be2a4b3084 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,4 +1,4 @@ -""" from https://github.com/keithito/tacotron """ +"""from https://github.com/keithito/tacotron""" import re diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 0feef3bdfb..1b1e0ca0fb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,4 +1,4 @@ -īģŋ# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py +# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 4bd03851c7..6cc6ec0b37 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -52,7 +52,7 @@ def _init_language(self, language): """ if not self.is_supported_language(language): - raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") + raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend') return language @property diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 9a8841106c..55b8575aa4 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -222,9 +222,9 @@ def __init__( self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert ( - self.win_length <= self.fft_size - ), f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert self.win_length <= self.fft_size, ( + f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + ) members = vars(self) logger.info("Setting up Audio Processor...") for key, value in members.items(): @@ -283,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_norm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) return S_norm S_norm = self.max_norm * S_norm @@ -318,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_denorm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db @@ -351,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np. if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key], ( + f" [!] Audio param {key} does not match the value used for computing mean-var stats. 
{stats_config[key]} vs {self.__dict__[key]}" + ) return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index 4e8f3825b9..d24733977a 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -49,9 +49,9 @@ def __init__( label_key="class_name", ): super().__init__(dataset_items) - assert ( - batch_size % (num_classes_in_batch * num_gpus) == 0 - ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + ) label_indices = {} for idx, item in enumerate(dataset_items): diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py index cddacd69ab..cf31a866de 100644 --- a/TTS/vc/layers/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -330,7 +330,7 @@ def __init__( self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" + "Self-attention requires query, key and value to be of the same size" ) k_bias = True diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index cb2c0152e8..6358662e18 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -68,8 +68,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() + mask_prob * all_sz / float(mask_length) + np.random.rand() ) all_num_mask = max(min_masks, all_num_mask) @@ -80,8 +79,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - 
mask_prob * sz / float(mask_length) - + np.random.rand() + mask_prob * sz / float(mask_length) + np.random.rand() ) num_mask = max(min_masks, num_mask) else: @@ -155,9 +153,7 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = ( - "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - ) + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -166,9 +162,7 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = ( - "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - ) + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] 
self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 90afd997b4..a953b901e8 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index c0882c701f..076545f8a2 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -128,9 +128,9 @@ def load_item(self, idx): # correct the audio length wrt padding applied in stft audio = np.pad(audio, (0, self.hop_len), mode="edge") audio = audio[: mel.shape[-1] * self.hop_len] - assert ( - mel.shape[-1] * self.hop_len == audio.shape[-1] - ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + assert mel.shape[-1] * self.hop_len == audio.shape[-1], ( + f" [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + ) audio = torch.from_numpy(audio).float().unsqueeze(0) mel = torch.from_numpy(mel).float().squeeze(0) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 3ae9015451..435330bebe 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -102,9 +102,9 @@ def load_item(self, idx): audio = np.pad( audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0 ) - assert ( - audio.shape[-1] >= self.seq_len + self.pad_short - ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + assert audio.shape[-1] >= self.seq_len + self.pad_short, ( + f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + ) # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 0fad81864e..81a1f30884 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -224,9 +224,9 @@ class GeneratorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False @@ -311,9 +311,9 @@ class DiscriminatorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." 
+ ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 42dfef32b7..ba3852e795 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -204,7 +204,12 @@ def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tup return figures, audios def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) @@ -218,7 +223,12 @@ def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tu return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> tuple[dict, 
np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index b2100c55b1..308b12ab56 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -313,9 +313,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index f4ef3a0734..71b38d4c0d 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -108,9 +108,9 @@ def forward(self, c): # perform upsampling if c is not None and self.upsample_net is not None: c = self.upsample_net(c) - 
assert ( - c.shape[-1] == x.shape[-1] - ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + assert c.shape[-1] == x.shape[-1], ( + f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + ) # encode to hidden representation x = self.first_conv(x) @@ -155,9 +155,7 @@ def _apply_weight_norm(m): def receptive_field_size(self): return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 16c66e235b..b1a4a26562 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -217,9 +217,7 @@ def apply_weight_norm(self): self.out_conv = weight_norm(self.out_conv) self.y_conv = weight_norm(self.y_conv) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -257,7 +255,12 @@ def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: 
disable=unused-argument ) -> tuple[dict, np.ndarray]: pass @@ -266,7 +269,12 @@ def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: pass diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 2fe55f91bc..5a93f125ba 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -225,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: - assert ( - np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length - ), " [!] upsample scales needs to be equal to hop_length" + assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, ( + " [!] 
upsample scales needs to be equal to hop_length" + ) self.upsample = UpsampleNetwork( self.args.feat_dims, self.args.upsample_factors, @@ -527,9 +527,7 @@ def xfade_and_unfold(y, target, overlap): return unfolded - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -556,7 +554,10 @@ def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: @torch.no_grad() def test( - self, assets: dict, test_loader: "DataLoader", output: dict # pylint: disable=unused-argument + self, + assets: dict, + test_loader: "DataLoader", + output: dict, # pylint: disable=unused-argument ) -> tuple[dict, dict]: ap = self.ap figures = {} @@ -578,7 +579,11 @@ def test( return figures, audios def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index f9067530e6..370a541b97 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -240,12 +240,8 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = ( - "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, 
āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" - ) - self._EXPECTED = ( - "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" - ) + self._TEST_CASES = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" + self._EXPECTED = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 25c169eddd..f5d342bb00 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -45,11 +45,11 @@ def test_normalize_unicode() -> None: ("na\u0303", "nÃŖ"), ("o\u0302u", "ôu"), ("n\u0303", "Ãą"), - ("\u4E2D\u56FD", "中å›Ŋ"), + ("\u4e2d\u56fd", "中å›Ŋ"), ("niÃąo", "niÃąo"), ("a\u0308", "ä"), ("\u3053\u3093\u306b\u3061\u306f", "こんãĢãĄã¯"), - ("\u03B1\u03B2", "ιβ"), + ("\u03b1\u03b2", "ιβ"), ] for arg, expect in test_cases: assert normalize_unicode(arg) == expect diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 9a8027736e..72069bf943 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,9 +72,9 @@ def test_train_step(self): # pylint: disable=no-self-use for 
param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -131,9 +131,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -198,9 +198,9 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert ( - param != param_ref - ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 # with file gst style @@ -254,9 +254,9 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert ( - param != param_ref - ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -321,9 +321,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 3976b9ae8d..5f9af86e7e 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -71,9 +71,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -127,9 +127,9 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -186,9 +186,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 # with file gst style @@ -238,9 +238,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -305,9 +305,9 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index f0b347b895..790439ecb2 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -373,9 +373,9 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert ( - param != param_ref - ).any(), f"param {name} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {name} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" + ) count = count + 1 def _create_batch(self, config, batch_size): diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 967e9ecb9e..c92063576f 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -42,9 +42,9 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert ( - param != param_ref - ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" + ) count += 1 def test_init_multispeaker(self): @@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size): # inference encoder and decoder with MAS y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) y2 = model.decoder_inference(mel_spec, mel_lengths) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape + assert y2["model_outputs"].shape == y["model_outputs"].shape, ( + "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) ) def test_inference_with_MAS(self): diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index dd45d6941f..784e32a68d 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -80,9 +80,9 @@ def _test_inference(self, batch_size): wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths) - assert ( - output_wav.shape[-1] // config.audio.hop_length == 
wavlm_vec.shape[-1] - ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], ( + f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + ) def test_inference(self): self._test_inference(1) @@ -95,9 +95,9 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + ) def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py index c9f7ae3931..703873ea47 100644 --- a/tests/vc_tests/test_openvoice.py +++ b/tests/vc_tests/test_openvoice.py @@ -16,7 +16,6 @@ class TestOpenVoice(unittest.TestCase): - @staticmethod def _create_inputs_inference(): source_wav = torch.rand(16100) @@ -37,6 +36,6 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}" + ) From ce9f1bfddb53a448086c8d2ac5ad34bf6fb01239 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Sat, 11 Jan 2025 01:19:08 +0100 Subject: [PATCH 7/8] build: switch black to ruff formatter --- .pre-commit-config.yaml | 6 +----- CONTRIBUTING.md | 2 +- Makefile | 4 ++-- pyproject.toml | 5 ----- 4 files changed, 4 
insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97542c8cc3..2f070ad085 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,13 +7,9 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: "https://github.com/psf/black" - rev: 24.10.0 - hooks: - - id: black - language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b3a973763..5fe9421442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,7 +88,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv run make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code formatting. +9. Format your code. We use ```ruff``` for code formatting. ```bash make style diff --git a/Makefile b/Makefile index 35345b8c1f..d86845ddcf 100644 --- a/Makefile +++ b/Makefile @@ -43,11 +43,11 @@ test_failed: ## only run tests failed the last time. coverage run -m pytest -x -v --last-failed tests style: ## update code style. - uv run --only-dev black ${target_dirs} + uv run --only-dev ruff format ${target_dirs} lint: ## run linters. 
uv run --only-dev ruff check ${target_dirs} - uv run --only-dev black ${target_dirs} --check + uv run --only-dev ruff format ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/pyproject.toml b/pyproject.toml index d84b3bb947..e9516d3d8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,6 @@ all = [ [dependency-groups] dev = [ - "black==24.10.0", "coverage[toml]>=7", "pre-commit>=4", "pytest>=8", @@ -232,10 +231,6 @@ max-returns = 7 "E402", # module level import not at top of file ] -[tool.black] -line-length = 120 -target-version = ['py310'] - [tool.coverage.report] skip_covered = true skip_empty = true From 008912ce76e48c78ef5f83d88dc26f7d04a06fe3 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 16 Jan 2025 15:47:35 +0100 Subject: [PATCH 8/8] fix(tortoise): do not set device, which is handled in BaseTrainerModel [ci skip] --- TTS/tts/models/tortoise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index b44a5fbfc6..a42d577676 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -342,7 +342,6 @@ def __init__(self, config: Coqpit): else self.args.autoregressive_batch_size ) self.enable_redaction = self.args.enable_redaction - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.enable_redaction: self.aligner = Wav2VecAlignment()