diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml index c7dd4f5f99..88a73e8481 100644 --- a/.github/actions/setup-uv/action.yml +++ b/.github/actions/setup-uv/action.yml @@ -4,8 +4,9 @@ runs: using: 'composite' steps: - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: - version: "0.5.4" + version: "0.5.17" enable-cache: true cache-dependency-glob: "**/pyproject.toml" + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index d1060f6be2..03426808cc 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -9,15 +9,9 @@ on: jobs: lint: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Lint check run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8d639d5dee..aa01abb874 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,14 +22,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9, "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | @@ -37,7 +35,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -51,7 +48,7 @@ jobs: - name: Unit tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} @@ -67,22 +64,19 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + python-version: ["3.10", "3.12"] + subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts"]'), matrix.subset) run: | sudo apt-get update sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -96,7 +90,7 @@ jobs: - name: Integration tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ 
matrix.subset }} @@ -107,9 +101,48 @@ jobs: name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* if-no-files-found: ignore + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* + if-no-files-found: ignore coverage: if: always() - needs: [unit, integration] + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62420e9958..2f070ad085 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,13 +7,9 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: "https://github.com/psf/black" - rev: 24.2.0 - hooks: - - id: black - language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b3a973763..5fe9421442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,7 +88,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv run make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code formatting. +9. Format your code. We use ```ruff``` for code formatting. ```bash make style diff --git a/Makefile b/Makefile index 6964773fb5..d86845ddcf 100644 --- a/Makefile +++ b/Makefile @@ -6,55 +6,48 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_tts2: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests2 + coverage run -m pytest -x -v --durations=0 tests/tts_tests2 test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/xtts_tests test_aux: ## run aux tests. 
- coverage run -m nose2 -F -v -B tests.aux_tests - ./run_bash_tests.sh + coverage run -m pytest -x -v --durations=0 tests/aux_tests + +test_zoo: ## run zoo tests. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. - uv run --only-dev black ${target_dirs} + uv run --only-dev ruff format ${target_dirs} lint: ## run linters. uv run --only-dev ruff check ${target_dirs} - uv run --only-dev black ${target_dirs} --check + uv run --only-dev ruff format ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/README.md b/README.md index 9ccf8657ab..db8868b26d 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ repository are also still a useful source of information. ### Voice Conversion - [FreeVC](https://arxiv.org/abs/2210.15418) +- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419) - [OpenVoice](https://arxiv.org/abs/2312.01479) ### Others @@ -115,7 +116,7 @@ You can also help us implement more models. ## Installation -🐸TTS is tested on Ubuntu 24.04 with **python >= 3.9, < 3.13**, but should also +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.13**, but should also work on Mac and Windows. If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. @@ -170,7 +171,7 @@ You can also try out Coqui TTS without installation with the docker image. 
Simply run the following command and you will be able to run TTS: ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server ``` @@ -234,7 +235,7 @@ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) #### Voice conversion (VC) -Converting the voice in `source_wav` to the voice of `target_wav` +Converting the voice in `source_wav` to the voice of `target_wav`: ```python tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") @@ -246,9 +247,13 @@ tts.voice_conversion_to_file( ``` Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/knnvc` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` +For more details, see the +[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html). + #### Voice cloning by combining single speaker TTS model with the default VC model This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is diff --git a/TTS/.models.json b/TTS/.models.json index 36654d0555..05c88bef43 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -787,6 +787,22 @@ "license": "apache 2.0" } }, + "librispeech100": { + "wavlm-hifigan": { + "description": "HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + }, + "wavlm-hifigan_prematched": { + "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + } + }, "ljspeech": { "multiband-melgan": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", @@ -927,18 +943,27 @@ "freevc24": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", + "default_vocoder": null, "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } }, "multi-dataset": { + "knnvc": { + "description": "kNN-VC model from https://github.com/bshall/knn-vc", + "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT", + "commit": null + }, "openvoice_v1": { "hf_url": [ "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null @@ -949,6 +974,7 @@ 
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null diff --git a/TTS/api.py b/TTS/api.py index 86a311112e..abbd164dea 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,6 @@ import tempfile import warnings from pathlib import Path -from typing import Optional from torch import nn @@ -22,15 +21,15 @@ def __init__( self, model_name: str = "", *, - model_path: Optional[str] = None, - config_path: Optional[str] = None, - vocoder_name: Optional[str] = None, - vocoder_path: Optional[str] = None, - vocoder_config_path: Optional[str] = None, - encoder_path: Optional[str] = None, - encoder_config_path: Optional[str] = None, - speakers_file_path: Optional[str] = None, - language_ids_file_path: Optional[str] = None, + model_path: str | None = None, + config_path: str | None = None, + vocoder_name: str | None = None, + vocoder_path: str | None = None, + vocoder_config_path: str | None = None, + encoder_path: str | None = None, + encoder_config_path: str | None = None, + speakers_file_path: str | None = None, + language_ids_file_path: str | None = None, progress_bar: bool = True, gpu: bool = False, ) -> None: @@ -77,8 +76,8 @@ def __init__( super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None - self.synthesizer = None - self.voice_converter = None + self.synthesizer: Synthesizer | None = None + self.voice_converter: Synthesizer | None = None self.model_name = "" self.vocoder_path = vocoder_path @@ -95,7 +94,7 @@ def __init__( if "tts_models" in model_name: self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu=gpu) + self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu) # To allow just TTS("xtts") else: self.load_model_by_name(model_name, vocoder_name, gpu=gpu) @@ -156,25 +155,27 @@ def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() def download_model_by_name( - self, model_name: str, vocoder_name: Optional[str] = None - ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]: + self, model_name: str, vocoder_name: str | None = None + ) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files # we assume that the model knows how to load itself - return None, None, model_path + return None, None, None, None, model_path if model_item.get("default_vocoder") is None: - return model_path, config_path, None + return model_path, config_path, None, None, None if vocoder_name is None: vocoder_name = model_item["default_vocoder"] - vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) - # A local vocoder model will take precedence if specified via vocoder_path - if self.vocoder_path is None or self.vocoder_config_path is None: - self.vocoder_path = vocoder_path - self.vocoder_config_path = vocoder_config_path - return model_path, config_path, None - - def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = 
None, *, gpu: bool = False) -> None: + vocoder_path, vocoder_config_path = None, None + # A local vocoder model will take precedence if already specified in __init__ + if model_item["model_type"] == "tts_models": + vocoder_path = self.vocoder_path + vocoder_config_path = self.vocoder_config_path + if vocoder_path is None or vocoder_config_path is None: + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) + return model_path, config_path, vocoder_path, vocoder_config_path, None + + def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: @@ -183,7 +184,7 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None """ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: + def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -191,12 +192,19 @@ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) self.voice_converter = Synthesizer( - vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu + vc_checkpoint=model_path, + vc_config=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + model_dir=model_dir, + use_cuda=gpu, ) - def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: + def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. 
Args: @@ -208,7 +216,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = self.synthesizer = None self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model @@ -217,8 +227,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=self.vocoder_path, - vocoder_config=self.vocoder_config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, encoder_checkpoint=self.encoder_path, encoder_config=self.encoder_config_path, model_dir=model_dir, @@ -250,11 +260,11 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool def _check_arguments( self, - speaker: Optional[str] = None, - language: Optional[str] = None, - speaker_wav: Optional[str] = None, - emotion: Optional[str] = None, - speed: Optional[float] = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -273,11 +283,11 @@ def _check_arguments( def tts( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, + speed: float | None = None, split_sentences: bool = True, **kwargs, ): @@ -323,10 +333,10 @@ def tts( def tts_to_file( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, speed: float = 1.0, pipe_out=None, file_path: str = "output.wav", @@ -378,7 +388,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], ): """Voice conversion with FreeVC. Convert source wav to target speaker. @@ -396,7 +406,7 @@ def voice_conversion( def voice_conversion_to_file( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], file_path: str = "output.wav", pipe_out=None, ) -> str: @@ -419,9 +429,10 @@ def voice_conversion_to_file( def tts_with_vc( self, text: str, - language: str = None, - speaker_wav: str = None, - speaker: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], + speaker: str | None = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. 
@@ -461,10 +472,11 @@ def tts_with_vc( def tts_with_vc_to_file( self, text: str, - language: str = None, - speaker_wav: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], file_path: str = "output.wav", - speaker: str = None, + speaker: str | None = None, split_sentences: bool = True, pipe_out=None, ) -> str: diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index b8f69b54e5..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -113,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index dc0ce5b18b..d450e26fba 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -15,6 +15,88 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. 
You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + return parser.parse_args() + + def compute_embeddings( model_path, config_path, @@ -102,88 +184,9 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -if __name__ == "__main__": +def main(arg_list: list[str] | None = None): setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser( - description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" - """ - Example runs: - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv - """, - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--model_path", - type=str, - help="Path to model checkpoint file. It defaults to the released speaker encoder.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to model config file. It defaults to the released speaker encoder config.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", - ) - parser.add_argument( - "--config_dataset_path", - type=str, - help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", - default=None, - ) - parser.add_argument( - "--output_path", - type=str, - help="Path for output `pth` or `json` file.", - default="speakers.pth", - ) - parser.add_argument( - "--old_file", - type=str, - help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", - default=None, - ) - parser.add_argument( - "--old_append", - help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", - default=False, - action="store_true", - ) - parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) - parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") - parser.add_argument( - "--formatter_name", - type=str, - help="Name of the formatter to use. 
You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_name", - type=str, - help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_train", - type=str, - help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_val", - type=str, - help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - args = parser.parse_args() + args = parse_args(arg_list) compute_embeddings( args.model_path, @@ -200,3 +203,7 @@ def compute_embeddings( disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index acec91c369..1da7a092fb 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import argparse import glob @@ -17,10 +16,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) - +def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -30,7 +26,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: list[str] | None = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -95,6 +97,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index a04005ce39..be9387f015 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,8 +3,8 @@ import argparse import logging -import os import sys +from pathlib import Path import numpy as np import torch @@ -13,8 +13,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer 
from TTS.utils.audio import AudioProcessor @@ -24,56 +26,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -115,18 +127,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -141,9 +153,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -154,16 +166,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for 
_, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -182,7 +202,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -196,7 +216,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -218,74 +238,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, 
help="Path to save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 7c68fdb070..40afa1456c 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,4 +1,4 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging @@ -14,18 +14,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -36,13 +31,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = parser.parse_args() + return parser.parse_args(arg_list) - c = load_config(args.config_path) + +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -50,13 +53,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." + raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" 
) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -74,6 +80,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 5d20db6a59..00d7530427 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -134,7 +134,7 @@ """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -274,13 +274,14 @@ def parse_args() -> argparse.Namespace: "--source_wav", type=str, default=None, - help="Original audio file to convert in the voice of the target_wav", + help="Original audio file to convert into the voice of the target_wav", ) parser.add_argument( "--target_wav", type=str, + nargs="*", default=None, - help="Target audio file to convert in the voice of the source_wav", + help="Audio file(s) of the target voice into which to convert the source_wav", ) parser.add_argument( @@ -290,7 +291,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -309,9 +310,9 @@ def parse_args() -> argparse.Namespace: return args -def main() -> None: +def main(arg_list: list[str] | None = None) -> None: """Entry point for `tts` command line interface.""" - args = parse_args() + args = parse_args(arg_list) stream = sys.stderr if args.pipe_out else sys.stdout setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) @@ -340,18 +341,18 @@ def main() -> None: # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + sys.exit(0) if args.model_info_by_name: model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() + sys.exit(0) # 3) Load a model for further info or TTS/VC device = args.device @@ -377,23 +378,23 @@ def main() -> None: if args.list_speaker_idxs: if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) logger.info(api.speakers) - return + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) logger.info(api.languages) - return + sys.exit(0) # check the arguments against a multi-speaker model. if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): @@ -401,7 +402,7 @@ def main() -> None: "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." 
) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: @@ -430,6 +431,7 @@ def main() -> None: pipe_out=pipe_out, ) logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 84123d2db3..914c729856 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import os @@ -87,7 +86,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data @@ -219,10 +218,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.print_step == 0: print( - " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + f" | > Step:{global_step} Loss:{loss.item():.5f} GradNorm:{grad_norm:.5f} " + f"StepTime:{step_time:.2f} LoaderTime:{loader_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} LR:{current_lr:.6f}", flush=True, ) @@ -236,10 +233,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("") print( - ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " - "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time - ), + f">>> Epoch:{epoch} AvgLoss: {tot_loss / len(data_loader):.5f} GradNorm:{grad_norm:.5f} " + f"EpochTime:{epoch_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} ", flush=True, ) # evaluation @@ -249,7 +244,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + f" | > Epoch:{epoch} AvgLoss: {eval_loss:.5f} ", flush=True, ) # save the best checkpoint @@ -301,7 +296,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion, args.restore_step = model.load_checkpoint( c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion ) - print(" > Model restored from step %d" % args.restore_step, flush=True) + print(f" > Model restored from step {args.restore_step}", flush=True) else: args.restore_step = 0 @@ -311,7 +306,7 @@ def main(args): # pylint: disable=redefined-outer-name scheduler = None num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) if use_cuda: model = model.cuda() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index aa04177068..58122b9005 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -17,7 +17,7 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) @@ -26,7 +26,7 @@ def main(): parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) 
train_args.parse_args(args) # load config.json and register @@ -76,6 +76,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index e5f40c0296..401003504e 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Any, Dict, Union +from typing import Any, Union import fsspec import yaml @@ -54,11 +54,11 @@ def register_config(model_name: str) -> Coqpit: return config_class -def _process_model_name(config_dict: Dict) -> str: +def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: - config_dict (Dict): A dictionary including the config fields. + config_dict (dict): A dictionary including the config fields. Returns: str: Formatted modelname. @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit: +def load_config(config_path: str | os.PathLike[Any]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 7fae77d613..a0a013b0de 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass -from typing import List from coqpit import Coqpit, check_argument from trainer import TrainerConfig @@ -227,7 +226,7 @@ class BaseDatasetConfig(Coqpit): dataset_name: str = "" path: str = "" meta_file_train: str = "" - ignored_speakers: List[str] = None + ignored_speakers: list[str] = None language: str = "" phonemizer: str = "" meta_file_val: str = "" diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 7ac38ed6ee..dac5f0870a 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -104,7 +104,7 @@ def isatty(self): def read_logs(): sys.stdout.flush() - with open(sys.stdout.log_file, "r") as f: + with open(sys.stdout.log_file) as f: return f.read() diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index ebbaa0457b..d2d0ef580d 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import MISSING @@ -12,9 +11,9 @@ class BaseEncoderConfig(BaseTrainingConfig): model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params - model_params: Dict = field( + model_params: dict = field( default_factory=lambda: { "model_name": "lstm", "input_dim": 80, @@ -25,7 +24,7 @@ class BaseEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation: Dict = field(default_factory=lambda: {}) + audio_augmentation: dict = field(default_factory=lambda: {}) # training params epochs: int = 10000 @@ -33,7 +32,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: 
{"betas": [0.9, 0.999], "weight_decay": 0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 @@ -56,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig): def check_values(self): super().check_values() c = asdict(self) - assert ( - c["model_params"]["input_dim"] == self.audio.num_mels - ), " [!] model input dimendion must be equal to melspectrogram dimension." + assert c["model_params"]["input_dim"] == self.audio.num_mels, ( + " [!] model input dimendion must be equal to melspectrogram dimension." + ) diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 2082019aad..c6680c3a25 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -34,7 +34,7 @@ class BaseEncoder(nn.Module): # pylint: disable=W0102 def __init__(self): - super(BaseEncoder, self).__init__() + super().__init__() def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances @@ -107,7 +107,7 @@ def get_criterion(self, c: Coqpit, num_classes=None): elif c.loss == "softmaxproto": criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) else: - raise Exception("The %s not is a loss supported" % c.loss) + raise Exception(f"The {c.loss} not is a loss supported") return criterion def load_checkpoint( diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 5eafcd6005..d7f3a2f4bd 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -7,7 +7,7 @@ class SELayer(nn.Module): def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() + super().__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), @@ -27,7 +27,7 @@ class SEBasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) @@ -73,7 +73,7 @@ def __init__( use_torch_spec=False, audio_config=None, ): - super(ResNetSpeakerEncoder, self).__init__() + super().__init__() self.encoder_type = encoder_type self.input_dim = input_dim diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 495b4def5a..d6c4f9fa50 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -class AugmentWAV(object): +class AugmentWAV: def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index 37619ed0f8..8d50ffd5f5 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo # All rights reserved. 
# @@ -17,7 +16,7 @@ # Only support eager mode and TF>=2.0.0 # pylint: disable=no-member, invalid-name, relative-beyond-top-level # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes -""" voxceleb 1 & 2 """ +"""voxceleb 1 & 2""" import csv import hashlib @@ -81,19 +80,19 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logger.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( - "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + "wget {} --user {} --password {} -O {}".format(url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size) # concatenate all parts into zip files if ".zip" not in zip_filepath: zip_filepath = "_".join(zip_filepath.split("_")[:-1]) - subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + subprocess.call(f"cat {zip_filepath}* > {zip_filepath}.zip", shell=True) zip_filepath += ".zip" extract_path = zip_filepath.strip(".zip") @@ -101,12 +100,12 @@ def download_and_extract(directory, subset, urls): with open(zip_filepath, "rb") as f_zip: md5 = hashlib.md5(f_zip.read()).hexdigest() if md5 != MD5SUM[subset]: - raise ValueError("md5sum of %s mismatch" % zip_filepath) + raise ValueError(f"md5sum of {zip_filepath} mismatch") with zipfile.ZipFile(zip_filepath, "r") as zfile: zfile.extractall(directory) extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) - subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + subprocess.call(f"mv {extract_path_ori} {extract_path}", shell=True) finally: # os.remove(zip_filepath) pass @@ -122,9 +121,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logger.info(f"Child was terminated by signal {retcode}") + logger.info("Child was terminated by signal %d", retcode) except OSError as e: - logger.info(f"Execution failed: {e}") + logger.info("Execution failed: %s", e) retcode = -999 return retcode @@ -138,10 +137,10 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. """ cmd = f"ffmpeg -i {aac_file} {wav_file}" - logger.info(f"Decoding aac file using command line: {cmd}") + logger.info("Decoding aac file using command line: %s", cmd) ret = exec_cmd(cmd) if ret != 0: - logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Failed to decode aac file with retcode %s", ret) logger.error("Please check your ffmpeg installation.") return False return True @@ -156,7 +155,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. 
vox1_dev_wav.csv """ - logger.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s", subset) source_dir = os.path.join(input_dir, subset) files = [] @@ -194,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info("Successfully generated csv file {}".format(csv_file_path)) + logger.info("Successfully generated csv file %s", csv_file_path) def processor(directory, subset, force_process): diff --git a/TTS/model.py b/TTS/model.py index 779b1775a3..39faa7f690 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,6 +1,6 @@ import os from abc import abstractmethod -from typing import Any, Union +from typing import Any import torch from coqpit import Coqpit @@ -48,7 +48,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict def load_checkpoint( self, config: Coqpit, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -64,3 +64,7 @@ def load_checkpoint( It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... + + @property + def device(self) -> torch.device: + return next(self.parameters()).device diff --git a/TTS/server/server.py b/TTS/server/server.py index 6a4642f9a2..753e9103ab 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -8,9 +8,7 @@ import logging import os import sys -from pathlib import Path from threading import Lock -from typing import Union from urllib.parse import parse_qs try: @@ -19,10 +17,9 @@ msg = "Server requires requires flask, use `pip install coqui-tts[server]`" raise ImportError(msg) from e -from TTS.config import load_config +from TTS.api import TTS from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer logger = logging.getLogger(__name__) setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) @@ -60,6 +57,7 @@ def create_argparser() -> argparse.ArgumentParser: parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." @@ -73,8 +71,7 @@ def create_argparser() -> argparse.ArgumentParser: # parse the args args = create_argparser().parse_args() -path = Path(__file__).parent / "../.models.json" -manager = ModelManager(path) +manager = ModelManager(models_file=TTS.get_models_file_path()) # update in-use models to the specified released models. 
model_path = None @@ -86,55 +83,31 @@ def create_argparser() -> argparse.ArgumentParser: # CASE1: list pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() - -# CASE2: load pre-trained model paths -if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - -if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - -# CASE3: set custom model paths -if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path - speakers_file_path = args.speakers_file_path - -if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - -# load models -synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint="", - encoder_config="", - use_cuda=args.use_cuda, -) - -use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( - synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None -) -speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) - -use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( - synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None -) -language_manager = getattr(synthesizer.tts_model, "language_manager", None) + sys.exit(0) + +device = args.device +if args.use_cuda: + device = "cuda" + +# CASE2: load models +model_name = args.model_name if args.model_path is None else None +api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + speakers_file_path=args.speakers_file_path, + # language_ids_file_path=args.language_ids_file_path, +).to(device) # TODO: set this from SpeakerManager -use_gst = synthesizer.tts_config.get("use_gst", False) +use_gst = api.synthesizer.tts_config.get("use_gst", False) app = Flask(__name__) -def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: +def style_wav_uri_to_dict(style_wav: str) -> str | dict: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) @@ -158,27 +131,18 @@ def index(): return render_template( "index.html", show_details=args.show_details, - use_multi_speaker=use_multi_speaker, - use_multi_language=use_multi_language, - speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, - language_ids=language_manager.name_to_id if language_manager is not None else None, + use_multi_speaker=api.is_multi_speaker, + use_multi_language=api.is_multi_lingual, + speaker_ids=api.speakers, + language_ids=api.languages, use_gst=use_gst, ) @app.route("/details") def details(): - if args.config_path is not None and os.path.isfile(args.config_path): - model_config = load_config(args.config_path) - elif args.model_name is not None: - model_config = load_config(config_path) - - if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): - vocoder_config = 
load_config(args.vocoder_config_path) - elif args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + model_config = api.synthesizer.tts_config + vocoder_config = api.synthesizer.vocoder_config or None return render_template( "details.html", @@ -196,17 +160,23 @@ def details(): def tts(): with lock: text = request.headers.get("text") or request.values.get("text", "") - speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "") - language_idx = request.headers.get("language-id") or request.values.get("language_id", "") + speaker_idx = ( + request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None + ) + language_idx = ( + request.headers.get("language-id") or request.values.get("language_id", "") + if api.is_multi_lingual + else None + ) style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) logger.info("Model input: %s", text) logger.info("Speaker idx: %s", speaker_idx) logger.info("Language idx: %s", language_idx) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) + wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") @@ -248,9 +218,9 @@ def mary_tts_api_process(): else: text = request.args.get("INPUT_TEXT", "") logger.info("Model input: %s", text) - wavs = synthesizer.tts(text) + wavs = api.tts(text) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 317a01af53..2224396d1e 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.align_tts import AlignTTSArgs @@ -70,7 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) - phase_start_steps: List[int] = None + phase_start_steps: list[int] = None ssim_alpha: float = 1.0 spec_loss_alpha: float = 1.0 @@ -96,7 +95,7 @@ class AlignTTSConfig(BaseTTSConfig): r: int = 1 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index b846febe85..61d67b987a 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field -from typing import Dict from trainer.io import get_user_data_dir @@ -70,9 +69,9 @@ class BarkConfig(BaseTTSConfig): COARSE_INFER_TOKEN: int = 12_050 REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" - REMOTE_MODEL_PATHS: Dict = None - LOCAL_MODEL_PATHS: Dict = None - SMALL_REMOTE_MODEL_PATHS: Dict = None + REMOTE_MODEL_PATHS: dict = None + LOCAL_MODEL_PATHS: dict = None + SMALL_REMOTE_MODEL_PATHS: dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) DEF_SPEAKER_DIR: str 
= str(get_user_data_dir("tts/bark_v0/speakers")) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 805d995369..7f9e7a6ab2 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig @@ -73,7 +72,7 @@ class DelightfulTTSConfig(BaseTTSConfig): # optimizer steps_to_start_discriminator: int = 200000 - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -140,7 +139,7 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[List[str]] = field( + test_sentences: list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index d086d26564..5b50122e09 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -153,7 +152,7 @@ class FastPitchConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index af6c2db6fa..f375292256 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -147,7 +146,7 @@ class FastSpeechConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index d179617fb0..3d6ce4f4b3 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -168,7 +167,7 @@ class Fastspeech2Config(BaseTTSConfig): energy_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index f42f3e5a51..34b4057093 100644 --- 
a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -171,7 +170,7 @@ class GlowTTSConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index 50f72847ed..bd1736c880 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -126,7 +125,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.001 @@ -143,7 +142,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -162,9 +161,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. 
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index dc3e5548b8..93a6a9e377 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -145,7 +144,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.01 @@ -174,7 +173,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -193,9 +192,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. 
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index bf17322c19..bd5a28b43c 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import Coqpit, check_argument @@ -138,7 +137,7 @@ class CharactersConfig(Coqpit): characters_class: str = None # using BaseVocabulary - vocab_dict: Dict = None + vocab_dict: dict = None # using on BaseCharacters pad: str = None @@ -323,7 +322,7 @@ class BaseTTSConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -331,7 +330,7 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index bf8517dfc4..29221d7b25 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -164,7 +163,7 @@ class SpeedySpeechConfig(BaseTTSConfig): f0_cache_path: str = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 350b5ea996..e4b419d1fa 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @@ -154,7 +153,7 @@ class TacotronConfig(BaseTTSConfig): num_speakers: int = 1 num_chars: int = 0 r: int = 2 - gradual_training: List[List[int]] = None + gradual_training: list[list[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True @@ -212,7 +211,7 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field( + test_sentences: list[str] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", @@ -224,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. 
{self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 2d0242bf13..d85684c721 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @@ -112,7 +111,7 @@ class VitsConfig(BaseTTSConfig): audio: VitsAudioConfig = field(default_factory=VitsAudioConfig) # optimizer - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -146,7 +145,7 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[List] = field( + test_sentences: list[list] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], @@ -167,7 +166,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index bbf048e1ab..da6cc6edc6 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig @@ -70,7 +69,7 @@ class XttsConfig(BaseTTSConfig): model_args: XttsArgs = field(default_factory=XttsArgs) audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None - languages: List[str] = field( + languages: list[str] = field( default_factory=lambda: [ "en", "es", diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d1a37da4c1..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -2,8 +2,8 @@ import os import sys from collections import Counter +from collections.abc import Callable from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union import numpy as np @@ -17,7 +17,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): + items (list[list]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. eval_split_max_size (int): @@ -37,10 +37,8 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format( - 1 / len(items) + assert eval_split_size > 0, ( + f" [!] 
You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" ) np.random.seed(0) np.random.shuffle(items) @@ -71,18 +69,18 @@ def add_extra_keys(metadata, language, dataset_name): def load_tts_samples( - datasets: Union[List[Dict], Dict], + datasets: list[dict] | dict, eval_split=True, formatter: Callable = None, eval_split_max_size=None, eval_split_size=0.01, -) -> Tuple[List[List], List[List]]: - """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. +) -> tuple[list[list], list[list]]: + """Parse the dataset from the datasets config, load the samples as a list and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. Args: - datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are + datasets (list[dict], dict): A list of datasets or a single dataset dictionary. If multiple datasets are in the list, they are all merged. eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate @@ -101,7 +99,7 @@ def load_tts_samples( If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). Returns: - Tuple[List[List], List[List]: training and evaluation splits of the dataset. + tuple[list[list], list[list]: training and evaluation splits of the dataset. """ meta_data_train_all = [] meta_data_eval_all = [] if eval_split else None @@ -153,7 +151,7 @@ def load_tts_samples( def load_attention_mask_meta_data(metafile_path): """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r", encoding="utf-8") as f: + with open(metafile_path, encoding="utf-8") as f: lines = f.readlines() meta_data = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 5f629f32a9..6f21dcd1e0 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -3,7 +3,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import numpy as np import numpy.typing as npt @@ -47,7 +47,7 @@ def string2filename(string: str) -> str: return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") -def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: +def get_audio_size(audiopath: str | os.PathLike[Any]) -> int: """Return the number of samples in the audio file.""" if not isinstance(audiopath, str): audiopath = str(audiopath) @@ -63,7 +63,7 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = None): +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict | None = None): """Create inverse frequency weights for balancing the dataset. 
Use `multi_dict` to scale relative weights.""" @@ -94,23 +94,23 @@ def __init__( outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: Optional[list[dict]] = None, + samples: list[dict] | None = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: Optional[str] = None, - energy_cache_path: Optional[str] = None, + f0_cache_path: str | None = None, + energy_cache_path: str | None = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: Optional[str] = None, + phoneme_cache_path: str | None = None, precompute_num_workers: int = 0, - speaker_id_mapping: Optional[dict] = None, - d_vector_mapping: Optional[dict] = None, - language_id_mapping: Optional[dict] = None, + speaker_id_mapping: dict | None = None, + d_vector_mapping: dict | None = None, + language_id_mapping: dict | None = None, use_noise_augment: bool = False, start_by_longest: bool = False, ) -> None: @@ -231,7 +231,7 @@ def lengths(self) -> list[int]: try: audio_len = get_audio_size(wav_file) except RuntimeError: - logger.warning(f"Failed to compute length for {item['audio_file']}") + logger.warning("Failed to compute length for %s", item["audio_file"]) audio_len = 0 lens.append(audio_len) return lens @@ -352,7 +352,7 @@ def _compute_lengths(samples): try: audio_length = get_audio_size(item["audio_file"]) except RuntimeError: - logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + logger.warning("Failed to compute length, skipping %s", item["audio_file"]) continue text_lenght = len(item["text"]) item["audio_length"] = audio_length @@ -437,14 +437,14 @@ def preprocess_samples(self) -> None: self.samples = samples logger.info("Preprocessing samples") - logger.info(f"Max text length: {np.max(text_lengths)}") - logger.info(f"Min text length: {np.min(text_lengths)}") - logger.info(f"Avg text length: {np.mean(text_lengths)}") - logger.info(f"Max audio length: {np.max(audio_lengths)}") - logger.info(f"Min audio length: {np.min(audio_lengths)}") - logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Max text length: %d", np.max(text_lengths)) + logger.info("Min text length: %d", np.min(text_lengths)) + logger.info("Avg text length: %.2f", np.mean(text_lengths)) + logger.info("Max audio length: %.2f", np.max(audio_lengths)) + logger.info("Min audio length: %.2f", np.min(audio_lengths)) + logger.info("Avg audio length: %.2f", np.mean(audio_lengths)) logger.info("Num. 
instances discarded samples: %d", len(ignore_idx)) - logger.info(f"Batch group size: {self.batch_group_size}.") + logger.info("Batch group size: %d", self.batch_group_size) @staticmethod def _sort_batch(batch, text_lengths): @@ -640,7 +640,7 @@ class PhonemeDataset(Dataset): def __init__( self, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], tokenizer: "TTSTokenizer", cache_path: str, precompute_num_workers: int = 0, @@ -744,10 +744,10 @@ class F0Dataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers: int = 0, normalize_f0: bool = True, ) -> None: @@ -896,9 +896,9 @@ class EnergyDataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers=0, normalize_energy=True, ) -> None: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ff1a76e2c9..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree as ET from glob import glob from pathlib import Path -from typing import List from tqdm import tqdm @@ -21,7 +20,7 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): https://github.com/freds0/CML-TTS-Dataset/""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -61,7 +60,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): """Interal dataset formatter.""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -104,7 +103,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "tweb" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") @@ -118,7 +117,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = cols[1].strip() @@ -133,7 +132,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="ISO 8859-1") as ttf: + with open(txt_file, encoding="ISO 8859-1") as ttf: for line in ttf: cols = line.strip().split("|") wav_file = cols[0].strip() @@ -177,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if speaker_name in ignored_speakers: continue logger.info(csv_file) - with open(txt_file, "r", 
encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") if not meta_files: @@ -201,7 +200,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ljspeech" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -215,7 +214,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: speaker_id = 0 for idx, line in enumerate(ttf): # 2 samples per speaker to avoid eval split issues @@ -236,7 +235,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "thorsten" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -268,7 +267,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ruslan" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") @@ -282,7 +281,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "css10" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -296,7 +295,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "nancy" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] @@ -309,7 +308,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("client_id"): continue @@ -338,7 +337,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): for meta_file in meta_files: _meta_file = os.path.basename(meta_file).split(".")[0] - with open(meta_file, "r", encoding="utf-8") as ttf: + with open(meta_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") file_name = cols[0] @@ -368,7 +367,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar items = [] speaker_name = "turkish-female" skipped_files = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav") @@ -386,7 +385,7 @@ def brspeech(root_path, meta_file, 
ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("wav_filename"): continue @@ -425,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -433,7 +432,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] # p280 has no mic2 recordings if speaker_id == "p280": @@ -452,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -460,7 +459,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append( @@ -482,7 +481,7 @@ def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt") ) if os.path.exists(txt_file) and os.path.exists(wav_file): - with open(txt_file, "r", encoding="utf-8") as file_text: + with open(txt_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -500,7 +499,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readline().replace("\n", "") # ignore sentences that contains digits if ignore_digits_sentences and any(map(str.isdigit, text)): @@ -513,7 +512,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] - with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: + with open(os.path.join(root_path, meta_files), encoding="utf-8") as meta: for line in meta: file, text = line.split("\t") text = text[:-1] @@ -553,7 +552,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): # if not exists meta file, 
crawl recursively for 'wav' files if meta_file is not None: - with open(str(meta_file), "r", encoding="utf-8") as f: + with open(str(meta_file), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] elif not cache_to.exists(): @@ -575,7 +574,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}") - with open(str(cache_to), "r", encoding="utf-8") as f: + with open(str(cache_to), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] @@ -583,7 +582,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): """Generic emotion dataset""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("file_path"): continue @@ -601,7 +600,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): return items -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument +def baker(root_path: str, meta_file: str, **kwargs) -> list[list[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -613,7 +612,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) @@ -626,7 +625,7 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kokoro" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -640,7 +639,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kss" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -653,7 +652,7 @@ def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "bel_tts" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index ade84794eb..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,7 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch import nn diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 65c7800dcf..457a20ea28 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,7 +2,6 @@ import os import re from glob import glob -from typing import Dict, List, 
Optional, Tuple import librosa import numpy as np @@ -34,9 +33,9 @@ def _normalize_whitespace(text): return re.sub(r"\s+", " ", text).strip() -def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def get_voices(extra_voice_dirs: list[str] = []): # pylint: disable=dangerous-default-value dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -49,7 +48,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: +def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -58,10 +57,8 @@ def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( - model, voice: str, extra_voice_dirs: List[str] = [] -) -> Tuple[ - Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] -]: # pylint: disable=dangerous-default-value + model, voice: str, extra_voice_dirs: list[str] = [] +) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None @@ -206,8 +203,8 @@ def generate_text_semantic( semantic_history = None encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET if len(encoded_text) > 256: - p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1) - logger.warning(f"warning, text too long, lopping of last {p}%") + p = (len(encoded_text) - 256) / len(encoded_text) * 100 + logger.warning("warning, text too long, lopping of last %.1f%%", p) encoded_text = encoded_text[:256] encoded_text = np.pad( encoded_text, diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 6b7caab916..dcec5b5bbc 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -88,7 +88,7 @@ def clear_cuda_cache(): def load_model(ckpt_path, device, config, model_type="text"): - logger.info(f"loading {model_type} model from {ckpt_path}...") + logger.info("loading %s model from %s...", model_type, ckpt_path) if device == "cpu": logger.warning("No GPU being used. 
Careful, Inference might be extremely slow!") @@ -108,11 +108,13 @@ def load_model(ckpt_path, device, config, model_type="text"): and os.path.exists(ckpt_path) and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"] ): - logger.warning(f"found outdated {model_type} model, removing...") + logger.warning("found outdated %s model, removing...", model_type) os.remove(ckpt_path) if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading...") - _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) + logger.info("%s model not found, downloading...", model_type) + # The URL in the config is a 404 and needs to be fixed + download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") + _download(download_url, ckpt_path, config.CACHE_DIR) checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack @@ -148,7 +150,7 @@ def load_model(ckpt_path, device, config, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info("model loaded: %.1fM params, %.3f loss", n_params / 1e6, val_loss) model.eval() model.to(device) del checkpoint, state_dict diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 54a9cecec0..4850d0a88b 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -175,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use assert idx.shape[1] >= 256 + 256 + 1 t = idx.shape[1] - 256 else: - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) # forward the GPT model itself if merge_context: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 29126b41ab..20f54d2152 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -101,9 +101,9 @@ def __init__(self, config): def forward(self, pred_idx, idx): device = idx.device b, t, codes = idx.size() - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 981d6cdb1f..9110ff5fd0 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,6 +1,6 @@ ### credit: https://github.com/dunky11/voicesmith import logging -from typing import Callable, Dict, Tuple +from collections.abc import Callable import torch import torch.nn.functional as F @@ -177,7 +177,7 @@ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument self._init_d_vector() @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on 
the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -194,11 +194,11 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -237,7 +237,7 @@ def _forward_aligner( x_mask: torch.IntTensor, y_mask: torch.IntTensor, attn_priors: torch.FloatTensor, - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -298,7 +298,7 @@ def forward( use_ground_truth: bool = True, d_vectors: torch.Tensor = None, speaker_idx: torch.Tensor = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable @@ -421,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index 1d5139571e..5cf41d4ff6 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -1,11 +1,9 @@ -from typing import Tuple - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: +def calc_same_padding(kernel_size: int) -> tuple[int, int]: pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) @@ -52,7 +50,7 @@ def __init__( w_init_gain="linear", use_weight_norm=False, ): - super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments + super().__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) @@ -94,7 +92,7 @@ def __init__( lstm_type="bilstm", use_linear=True, ): - super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.out_dim = out_dim self.lstm_type = lstm_type self.use_linear = use_linear diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index bd0c319dc1..31bab8cc97 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple, Union - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F @@ -36,9 +34,9 @@ class ReferenceEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, ): super().__init__() @@ -80,7 +78,7 @@ def __init__( batch_first=True, ) - def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> 
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ inputs --- [N, n_mels, timesteps] outputs --- [N, E//2] @@ -120,9 +118,9 @@ class UtteranceLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, @@ -192,9 +190,9 @@ class PhonemeLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, diff --git a/TTS/tts/layers/delightful_tts/energy_adaptor.py b/TTS/tts/layers/delightful_tts/energy_adaptor.py index ea0d1e4721..d2b4b0ffa8 100644 --- a/TTS/tts/layers/delightful_tts/energy_adaptor.py +++ b/TTS/tts/layers/delightful_tts/energy_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -59,7 +59,7 @@ def __init__( def get_energy_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index 4305022f18..93b65a2a74 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -1,5 +1,4 @@ import math -from typing import Tuple import numpy as np import torch @@ -9,7 +8,7 @@ from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: +def initialize_embeddings(shape: tuple[int]) -> torch.Tensor: assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
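Regarding the `ref_enc_filters`/`ref_enc_strides` annotations rewritten above: the change is a mechanical translation of the old `List[Union[...]]` spelling, but it is worth noting that a union of identical members collapses to that member, so these hints mean nothing more than `list[int]`. A quick illustration (the filter values are made up for the example):

```python
from typing import Union

# A union of identical types collapses to the type itself, so
# list[int | int | int | int | int | int] is equivalent to list[int].
assert Union[int, int, int] is int

ref_enc_filters: list[int] = [32, 32, 64, 64, 128, 128]  # illustrative values
print(ref_enc_filters)
```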
# Kaiming initialization return torch.randn(shape) * np.sqrt(2 / shape[1]) @@ -52,7 +51,7 @@ def __init__( kernel_size=3, use_partial_padding=False, # pylint: disable=unused-argument ): - super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.reduction_factor = reduction_factor reduced_dim = int(in_dim / reduction_factor) @@ -195,7 +194,7 @@ class STL(nn.Module): """ def __init__(self, n_hidden: int, token_num: int): - super(STL, self).__init__() # pylint: disable=super-with-arguments + super().__init__() num_heads = 1 E = n_hidden diff --git a/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/TTS/tts/layers/delightful_tts/pitch_adaptor.py index 9031369e0f..14e751d2e2 100644 --- a/TTS/tts/layers/delightful_tts/pitch_adaptor.py +++ b/TTS/tts/layers/delightful_tts/pitch_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -58,7 +58,7 @@ def __init__( def get_pitch_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py index caf939ffc7..2d08f03c2d 100644 --- a/TTS/tts/layers/feed_forward/encoder.py +++ b/TTS/tts/layers/feed_forward/encoder.py @@ -143,9 +143,9 @@ def __init__( elif encoder_type.lower() == "residual_conv_bn": self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params) elif encoder_type.lower() == "fftransformer": - assert ( - in_hidden_channels == out_channels - ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + assert in_hidden_channels == out_channels, ( + "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + ) # pylint: disable=unexpected-keyword-arg self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) else: diff --git a/TTS/tts/layers/generic/aligner.py b/TTS/tts/layers/generic/aligner.py index baa6f0e9c4..480c48f9a4 100644 --- a/TTS/tts/layers/generic/aligner.py +++ b/TTS/tts/layers/generic/aligner.py @@ -1,5 +1,3 @@ -from typing import Tuple - import torch from torch import nn @@ -68,7 +66,7 @@ def init_layers(self): def forward( self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None - ) -> Tuple[torch.tensor, torch.tensor]: + ) -> tuple[torch.tensor, torch.tensor]: """Forward pass of the aligner encoder. 
Shapes: - queries: :math:`[B, C, T_de]` diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 913add0d14..7765e224aa 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,9 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError( - "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) - ) + raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py index 9b7ecee2ba..2fe9bcc408 100644 --- a/TTS/tts/layers/generic/transformer.py +++ b/TTS/tts/layers/generic/transformer.py @@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument class FFTDurationPredictor: - def __init__( - self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None - ): # pylint: disable=unused-argument + def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None): # pylint: disable=unused-argument self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p) self.proj = nn.Linear(in_channels, 1) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index db62430c9d..1e744d62cf 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -814,7 +814,7 @@ def __init__(self, c): elif c.spec_loss_type == "l1": self.spec_loss = L1LossMasked(False) else: - raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type)) + raise ValueError(f" [!] Unknown spec_loss_type {c.spec_loss_type}") if c.duration_loss_type == "mse": self.dur_loss = MSELossMasked(False) @@ -823,7 +823,7 @@ def __init__(self, c): elif c.duration_loss_type == "huber": self.dur_loss = Huber() else: - raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type)) + raise ValueError(f" [!] Unknown duration_loss_type {c.duration_loss_type}") if c.model_args.use_aligner: self.aligner_loss = ForwardSumLoss() diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index 9f77af293c..a477b34f0b 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,5 +1,4 @@ import logging -from typing import List, Tuple import torch import torch.nn.functional as F @@ -44,7 +43,7 @@ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutio ) self.rnn_state = None - def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]: + def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> tuple[torch.FloatTensor, torch.LongTensor]: """Forward pass to the encoder. 
Args: @@ -110,7 +109,7 @@ class ParameterModel(nn.Module): def __init__( self, - outputnet_size: List[int], + outputnet_size: list[int], input_size: int, output_size: int, frame_channels: int, @@ -152,7 +151,7 @@ def __init__( encoder_dim: int, memory_rnn_dim: int, frame_channels: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float = 1e-2, ): diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index a12becef03..9142f65e8c 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -1,5 +1,3 @@ -from typing import List - import torch import torch.distributions as tdist import torch.nn.functional as F @@ -57,7 +55,7 @@ def __init__( prenet_dropout: float, prenet_dropout_at_inference: bool, memory_rnn_dim: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float, use_grad_checkpointing: bool = True, diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 32643dfcee..6f33edf3d7 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,3 @@ -# coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch import logging diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 1bbf676393..00fa559c77 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -101,9 +101,9 @@ def __init__( if num_head_channels == -1: self.num_heads = num_heads else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + assert channels % num_head_channels == 0, ( + f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + ) self.num_heads = channels // num_head_channels self.norm = normalization(channels) self.qkv = nn.Conv1d(channels, channels * 3, 1) diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index c67ee6c44b..6bbe6c389c 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,7 +1,6 @@ import logging import os from glob import glob -from typing import Dict, List import librosa import numpy as np @@ -88,9 +87,9 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def get_voices(extra_voice_dirs: List[str] = []): +def get_voices(extra_voice_dirs: list[str] = []): dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -100,7 +99,7 @@ def get_voices(extra_voice_dirs: List[str] = []): return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): +def load_voice(voice: str, extra_voice_dirs: list[str] = []): if voice == "random": return None, None @@ -116,7 +115,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return conds, None -def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): +def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): latents = [] clips = [] for voice in voices: @@ -126,14 +125,14 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: - assert ( - len(latents) == 0 - ), "Can only combine raw audio voices or latent voices, 
not both. Do it yourself if you want this." + assert len(latents) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) clips.extend(clip) elif clip is None: - assert ( - len(clips) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(clips) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) latents.append(latent) if len(latents) == 0: return clips, None diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index 00c884e973..eaeb2a03c1 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,7 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools import random -from typing import Optional import torch import torch.nn as nn @@ -609,9 +608,9 @@ def inference_speech( if input_tokens is None: inputs = fake_inputs else: - assert ( - num_return_sequences % input_tokens.shape[0] == 0 - ), "The number of return sequences must be divisible by the number of input sequences" + assert num_return_sequences % input_tokens.shape[0] == 0, ( + "The number of return sequences must be divisible by the number of input sequences" + ) fake_inputs = fake_inputs.repeat(num_return_sequences, 1) input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) inputs = torch.cat([fake_inputs, input_tokens], dim=1) @@ -640,8 +639,8 @@ def inference_speech( def _prepare_attention_mask_for_generation( inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + pad_token_id: torch.Tensor | None, + eos_token_id: torch.Tensor | None, ) -> torch.LongTensor: # No information for attention mask inference -> return default attention mask default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 2b29091b44..cfb8fa800d 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -653,7 +653,7 @@ def p_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: @@ -805,7 +805,7 @@ def ddim_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index 6a1d8ff784..c8892d456a 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -98,9 +98,7 @@ def __init__( if schedule not in ["discrete", "linear", "cosine"]: raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule - ) + f"Unsupported noise schedule {schedule}. 
The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule @@ -150,7 +148,7 @@ def marginal_log_mean_coeff(self, t): t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device), - ).reshape((-1)) + ).reshape(-1) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 elif self.schedule == "cosine": @@ -447,7 +445,7 @@ def correcting_xt_fn(xt, t, step): Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. """ - self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) + self.model = lambda x, t: model_fn(x, t.expand(x.shape[0])) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type @@ -527,7 +525,7 @@ def get_time_steps(self, skip_type, t_T, t_0, N, device): return t else: raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type) + f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): @@ -565,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type if order == 3: K = steps // 3 + 1 if steps % 3 == 0: - orders = [ - 3, - ] * ( - K - 2 - ) + [2, 1] + orders = [3] * (K - 2) + [2, 1] elif steps % 3 == 1: - orders = [ - 3, - ] * ( - K - 1 - ) + [1] + orders = [3] * (K - 1) + [1] else: - orders = [ - 3, - ] * ( - K - 1 - ) + [2] + orders = [3] * (K - 1) + [2] elif order == 2: if steps % 2 == 0: K = steps // 2 - orders = [ - 2, - ] * K + orders = [2] * K else: K = steps // 2 + 1 - orders = [ - 2, - ] * ( - K - 1 - ) + [1] + orders = [2] * (K - 1) + [1] elif order == 1: K = 1 - orders = [ - 1, - ] * steps + orders = [1] * steps else: raise ValueError("'order' must be '1' or '2' or '3'.") if skip_type == "logSNR": @@ -607,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) else: timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum( - torch.tensor( - [ - 0, - ] - + orders - ), - 0, - ).to(device) + torch.cumsum(torch.tensor([0] + orders), 0).to(device) ] return timesteps_outer, orders @@ -693,7 +663,7 @@ def singlestep_dpm_solver_second_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 0.5 ns = self.noise_schedule @@ -790,7 +760,7 @@ def singlestep_dpm_solver_third_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: @@ -913,7 +883,7 @@ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, x_t: A pytorch tensor. The approximated solution at time `t`. 
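The `get_orders_and_timesteps_for_singlestep_solver` cleanup above is behavior-preserving: `[3,] * (K - 2)` and `[3] * (K - 2)` build the same list, and the outer timesteps are still selected by a cumulative sum over the solver orders. A small sketch of that index computation, using an illustrative `steps=7`, `order=3` split:

```python
import torch

# For steps=7 and order=3: K = 7 // 3 + 1 = 3 and 7 % 3 == 1,
# so orders = [3] * (K - 1) + [1] = [3, 3, 1].
orders = [3, 3, 1]
idx = torch.cumsum(torch.tensor([0] + orders), 0)
print(idx.tolist())  # [0, 3, 6, 7] -- indices of the outer timesteps
```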
""" if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") ns = self.noise_schedule model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1] t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1] @@ -1062,7 +1032,7 @@ def singlestep_dpm_solver_update( r2=r2, ) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"): """ @@ -1086,7 +1056,7 @@ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, elif order == 3: return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def dpm_solver_adaptive( self, @@ -1150,8 +1120,8 @@ def higher_update(x, s, t, **kwargs): return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: + raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}") + while torch.abs(s - t_0).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) @@ -1219,9 +1189,9 @@ def inverse( """ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start t_T = self.noise_schedule.T if t_end is None else t_end - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) return self.sample( x, steps=steps, @@ -1364,9 +1334,9 @@ def sample( """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. 
For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) if return_intermediate: assert method in [ "multistep", @@ -1487,7 +1457,7 @@ def sample( if return_intermediate: intermediates.append(x) else: - raise ValueError("Got wrong method {}".format(method)) + raise ValueError(f"Got wrong method {method}") if denoise_to_zero: t = torch.ones((1,)).to(device) * t_0 x = self.denoise_to_zero_fn(x, t) diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index ed4d79d4ab..531f294220 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union +from typing import TypeVar import torch import torch.nn.functional as F @@ -11,7 +11,7 @@ _T = TypeVar("_T") -def cast_tuple(val: Union[tuple[_T], list[_T], _T], depth: int = 1) -> tuple[_T]: +def cast_tuple(val: tuple[_T] | list[_T] | _T, depth: int = 1) -> tuple[_T]: if isinstance(val, list): return tuple(val) return val if isinstance(val, tuple) else (val,) * depth @@ -43,9 +43,9 @@ def route_args(router, args, depth): class SequentialSequence(nn.Module): def __init__(self, layers, args_route={}, layer_dropout=0.0): super().__init__() - assert all( - len(route) == len(layers) for route in args_route.values() - ), "each argument route map must have the same depth as the number of sequential layers" + assert all(len(route) == len(layers) for route in args_route.values()), ( + "each argument route map must have the same depth as the number of sequential layers" + ) self.layers = layers self.args_route = args_route self.layer_dropout = layer_dropout diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index a5200c2673..e7497d8190 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,6 +1,6 @@ +from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional import torch import torch.nn as nn @@ -293,7 +293,7 @@ def __init__( hop_length=256, n_mel_channels=100, ): - super(UnivNetGenerator, self).__init__() + super().__init__() self.mel_channel = n_mel_channels self.noise_dim = noise_dim self.hop_length = hop_length @@ -344,7 +344,7 @@ def forward(self, c, z): return z def eval(self, inference=False): - super(UnivNetGenerator, self).eval() + super().eval() # don't remove weight norm while validation in training loop if inference: self.remove_weight_norm() @@ -378,7 +378,7 @@ def inference(self, c, z=None): class VocType: constructor: Callable[[], nn.Module] model_path: str - subkey: Optional[str] = None + subkey: str | None = None def optionally_index(self, model_dict): if self.subkey is not None: diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 0892fee19d..b2e74cf118 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -560,9 +560,9 @@ def __init__( self.rel_pos_bias = rel_pos_bias if rel_pos_bias: - assert ( - rel_pos_num_buckets <= rel_pos_max_distance - ), "number of relative position buckets must be less than the relative position max distance" + assert rel_pos_num_buckets <= rel_pos_max_distance, ( + "number of relative position buckets must be less than the relative position max distance" + ) self.rel_pos = RelativePositionBias( scale=dim_head**0.5, causal=causal, @@ -680,9 +680,9 @@ def forward( del input_mask if exists(attn_mask): - assert ( - 2 <= 
attn_mask.ndim <= 4 - ), "attention mask must have greater than 2 dimensions but less than or equal to 4" + assert 2 <= attn_mask.ndim <= 4, ( + "attention mask must have greater than 2 dimensions but less than or equal to 4" + ) if attn_mask.ndim == 2: attn_mask = rearrange(attn_mask, "i j -> () () i j") elif attn_mask.ndim == 3: @@ -790,9 +790,9 @@ def __init__( rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None - assert not ( - alibi_pos_bias and rel_pos_bias - ), "you can only choose Alibi positional bias or T5 relative positional bias, not both" + assert not (alibi_pos_bias and rel_pos_bias), ( + "you can only choose Alibi positional bias or T5 relative positional bias, not both" + ) if alibi_pos_bias: alibi_num_heads = default(alibi_num_heads, heads) @@ -922,9 +922,9 @@ def forward( past_key_values=None, expected_seq_len=None, ): - assert not ( - self.cross_attend ^ (exists(context) or exists(full_context)) - ), "context must be passed in if cross_attend is set to True" + assert not (self.cross_attend ^ (exists(context) or exists(full_context))), ( + "context must be passed in if cross_attend is set to True" + ) assert context is None or full_context is None, "only one of full_context or context can be provided" hiddens = [] @@ -940,9 +940,9 @@ def forward( rotary_pos_emb = None if exists(self.rotary_pos_emb): if not self.training and self.causal: - assert ( - expected_seq_len is not None - ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + assert expected_seq_len is not None, ( + "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + ) elif expected_seq_len is None: expected_seq_len = 0 seq_len = x.shape[1] diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py index 3cac1b8d6d..da5deea9ef 100644 --- a/TTS/tts/layers/vits/transforms.py +++ b/TTS/tts/layers/vits/transforms.py @@ -74,7 +74,7 @@ def unconstrained_rational_quadratic_spline( outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError("{} tails are not implemented.".format(tails)) + raise RuntimeError(f"{tails} tails are not implemented.") outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( inputs=inputs[inside_interval_mask], diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index 20eff26ecc..4e0f53616d 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -347,12 +347,12 @@ def forward( audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1])) # 💖 Lovely assertions - assert ( - max_mel_len <= audio_codes.shape[-1] - ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" - assert ( - max_text_len <= text_inputs.shape[-1] - ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + assert max_mel_len <= audio_codes.shape[-1], ( + f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" + ) + assert max_text_len <= text_inputs.shape[-1], ( + f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + ) # Append stop token to text inputs text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token) @@ -454,9 +454,9 @@ def forward( mel_targets[idx, l + 1 :] = -1 # check if stoptoken is in every row of mel_targets - assert 
(mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[ - 0 - ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], ( + f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + ) # ignore the loss for the segment used for conditioning # coin flip for the segment to be ignored diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 2e6ac01a87..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -97,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..e09a5233ac 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings -from typing import Callable, Optional, Union +from collections.abc import Callable import numpy as np import torch @@ -45,18 +45,18 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[StreamGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, - synced_gpus: Optional[bool] = False, + inputs: torch.Tensor | None = None, + generation_config: StreamGenerationConfig | None = None, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]] | None = None, + synced_gpus: bool | None = False, seed: int = 0, **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: + ) -> GenerateOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head. 
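Several decorators in this diff (the HiFi-GAN decoder's `inference`, `generate` and `sample_stream` above, and the various `inference()` methods earlier) move from `@torch.no_grad()` to `@torch.inference_mode()`. Both disable gradient tracking; `inference_mode` additionally skips autograd's version-counter and view-tracking bookkeeping, with the caveat that tensors created inside can never later participate in a graph that requires grad, which is fine for these inference-only paths. A minimal comparison:

```python
import torch

lin = torch.nn.Linear(4, 4)
x = torch.randn(2, 4)

with torch.no_grad():
    y1 = lin(x)  # gradients disabled

with torch.inference_mode():
    y2 = lin(x)  # gradients disabled, plus lighter autograd bookkeeping

print(y1.requires_grad, y2.requires_grad)  # False False
print(y2.is_inference())                   # True: cannot be reused in autograd later
```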
@@ -662,23 +662,23 @@ def typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, list[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_warper: LogitsProcessorList | None = None, + max_length: int | None = None, + pad_token_id: int | None = None, + eos_token_id: int | list[int] | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + output_scores: bool | None = None, + return_dict_in_generate: bool | None = None, + synced_gpus: bool | None = False, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> SampleOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -953,7 +953,6 @@ def init_stream_support(): def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: - warpers = LogitsProcessorList() if generation_config.temperature is not None and generation_config.temperature != 1.0: diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index fec8358deb..ef4162a1cb 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -76,7 +76,7 @@ def split_sentence(text, lang, text_split_length=250): # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), @@ -99,7 +99,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "señora"), ("sr", "señor"), @@ -112,7 +112,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "fr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mme", "madame"), ("mr", "monsieur"), @@ -124,7 +124,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "de": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("fr", "frau"), ("dr", "doktor"), @@ -134,7 +134,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "senhora"), ("sr", "senhor"), @@ -147,7 +147,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "it": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # ("sig.ra", "signora"), ("sig", "signore"), @@ -159,7 +159,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("p", "pani"), ("m", "pan"), @@ -169,19 +169,19 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # There are not many common abbreviations in Arabic as in English. ] ], "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "cs": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("ing", "inženýr"), # engineer @@ -190,7 +190,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\b", re.IGNORECASE), x[1]) for x in [ ("г-жа", "госпожа"), # Mrs. ("г-н", "господин"), # Mr. @@ -199,7 +199,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. @@ -209,7 +209,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("b", "bay"), # Mr. ("byk", "büyük"), # büyük @@ -218,7 +218,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "hu": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("b", "bácsi"), # Mr. @@ -227,13 +227,13 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. 
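The abbreviation tables above (and the symbol tables that follow) only swap `%`-formatting for f-strings when building the regexes; the compiled patterns are unchanged. A quick check with the English `("mrs", "misess")` entry:

```python
import re

old = re.compile("\\b%s\\." % "mrs", re.IGNORECASE)
new = re.compile(f"\\b{'mrs'}\\.", re.IGNORECASE)

assert old.pattern == new.pattern == "\\bmrs\\."
print(new.sub("misess", "Mrs. Smith"))  # misess Smith
```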
] @@ -249,7 +249,7 @@ def expand_abbreviations_multilingual(text, lang="en"): _symbols_multilingual = { "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " and "), ("@", " at "), @@ -261,7 +261,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " y "), ("@", " arroba "), @@ -273,7 +273,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " et "), ("@", " arobase "), @@ -285,7 +285,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " und "), ("@", " at "), @@ -297,7 +297,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " arroba "), @@ -309,7 +309,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " chiocciola "), @@ -321,7 +321,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " i "), ("@", " małpa "), @@ -334,7 +334,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ar": [ # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " و "), ("@", " على "), @@ -347,7 +347,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "zh": [ # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 和 "), ("@", " 在 "), @@ -360,7 +360,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "cs": [ # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " a "), ("@", " na "), @@ -373,7 +373,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ru": [ # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " и "), ("@", " собака "), @@ -386,7 +386,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "nl": [ # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " en "), ("@", " bij "), @@ -398,7 +398,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ve "), ("@", " at "), @@ -410,7 +410,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ 
("&", " és "), ("@", " kukac "), @@ -423,7 +423,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ko": [ # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 그리고 "), ("@", " 에 "), @@ -435,7 +435,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " और "), ("@", " ऐट दी रेट "), @@ -505,7 +505,7 @@ def _expand_decimal_point(m, lang="en"): def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) + amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))) full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 107054189c..0a8af2f950 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -31,7 +30,7 @@ class GPTTrainerConfig(XttsConfig): optimizer_wd_only_on_weights: bool = False weighted_loss_attrs: dict = field(default_factory=lambda: {}) weighted_loss_multipliers: dict = field(default_factory=lambda: {}) - test_sentences: List[dict] = field(default_factory=lambda: []) + test_sentences: list[dict] = field(default_factory=lambda: []) @dataclass @@ -197,10 +196,6 @@ def __init__(self, config: Coqpit): mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate ) - @property - def device(self): - return next(self.parameters()).device - def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens): """ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode @@ -225,8 +220,8 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: # init gpt for inference mode @@ -241,7 +236,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 s_info["language"], gpt_cond_len=3, )["wav"] - test_audios["{}-audio".format(idx)] = wav + test_audios[f"{idx}-audio"] = wav # delete inference layers del self.xtts.gpt.gpt_inference @@ -249,11 +244,15 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 return {"audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: return batch @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction @@ -335,7 +334,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + 
@torch.inference_mode() def inference( self, x, @@ -355,9 +354,9 @@ def get_sampler(self, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -400,7 +399,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the optimizer based on the config parameters.""" # ToDo: deal with multi GPU training if self.config.optimizer_wd_only_on_weights: @@ -431,7 +430,7 @@ def get_optimizer(self) -> List: v.is_norm = isinstance(m, norm_modules) v.is_emb = isinstance(m, emb_modules) - fpn = "%s.%s" % (mn, k) if mn else k # full param name + fpn = f"{mn}.{k}" if mn else k # full param name all_param_names.add(fpn) param_map[fpn] = v if v.is_bias or v.is_norm or v.is_emb: @@ -464,7 +463,7 @@ def get_optimizer(self) -> List: parameters=self.xtts.gpt.parameters(), ) - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the scheduler for the optimizer. Args: @@ -495,7 +494,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GPTTrainerConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index 69b8dae952..360d9b06c8 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -392,7 +392,7 @@ # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(object): +class ChineseChar: """ 中文字符 每个字符对应简体和繁体, @@ -420,13 +420,13 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super(ChineseNumberUnit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t def __str__(self): - return "10^{}".format(self.power) + return f"10^{self.power}" @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): @@ -447,7 +447,7 @@ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=Fals power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] ) else: - raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) + raise ValueError(f"Counting type should be in {NUMBERING_TYPES} ({numbering_type} provided).") class ChineseNumberDigit(ChineseChar): @@ -456,7 +456,7 @@ class ChineseNumberDigit(ChineseChar): """ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): - super(ChineseNumberDigit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -477,7 +477,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - super(ChineseMath, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -487,13 +487,13 @@ def 
__init__(self, simplified, traditional, symbol, expression=None): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(object): +class NumberSystem: """ 中文数字系统 """ -class MathSymbol(object): +class MathSymbol: """ 用于中文数字系统的数学符号 (繁/简体), e.g. positive = ['正', '正'] @@ -507,8 +507,7 @@ def __init__(self, positive, negative, point): self.point = point def __iter__(self): - for v in self.__dict__.values(): - yield v + yield from self.__dict__.values() # class OtherSymbol(object): @@ -640,7 +639,7 @@ def compute_value(integer_symbols): int_str = str(compute_value(int_part)) dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return "{0}.{1}".format(int_str, dec_str) + return f"{int_str}.{dec_str}" else: return int_str @@ -686,7 +685,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError(f"invalid input num string with more than one dot: {number_string}") if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -702,7 +701,7 @@ def get_value(value_string, use_zeros=True): if isinstance(v, CND) and v.value == 2: next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None - if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, CNU | type(None)): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): result_symbols[i] = liang @@ -1166,7 +1165,7 @@ def __call__(self, text): ) ndone = 0 - with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + with open(args.ifile, encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: if args.format == "tsv": reader = csv.DictReader(istream, delimiter="\t") assert "TEXT" in reader.fieldnames diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index ebfa171c80..4746b13ea2 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,11 @@ import logging -from typing import Dict, List, Union from TTS.utils.generic_utils import find_module logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": +def setup_model(config: "Coqpit", samples: list[list] | list[dict] = None) -> "BaseTTS": logger.info("Using model: %s", config.model) # fetch the right model implementation. 
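The `isinstance(previous_symbol, CNU | type(None))` form above, like the earlier `isinstance(shape, tuple | list)` checks in `diffusion.py`, relies on `isinstance()` accepting PEP 604 unions, which is available from Python 3.10 onward and therefore consistent with this PR dropping Python 3.9. A toy check:

```python
# isinstance() accepts X | Y unions on Python 3.10+.
shape = (1, 80, 400)
assert isinstance(shape, tuple | list)
assert isinstance(None, int | type(None))
print("union isinstance checks pass")
```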
if "base_model" in config and config["base_model"] is not None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 28a52bc558..c2e29c7100 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -233,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward( - self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None - ): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: - x: :math:`[B, T_max]` @@ -288,7 +285,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: @@ -352,9 +349,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -367,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -403,7 +396,7 @@ def on_epoch_start(self, trainer): self.phase = self._set_phase(trainer.config, trainer.total_steps_done) @staticmethod - def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "AlignTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index c52c541b25..84814745a2 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass -from typing import Optional +from pathlib import Path import numpy as np from coqpit import Coqpit @@ -42,10 +42,6 @@ def __init__( self.encodec = EncodecModel.encodec_model_24khz() self.encodec.set_target_bandwidth(6.0) - @property - def device(self): - return next(self.parameters()).device - def load_bark_models(self): self.semantic_model, self.config = load_model( ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" @@ -68,7 +64,7 @@ def train_step( def text_to_semantic( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, allow_early_stop=True, @@ -98,7 +94,7 @@ def text_to_semantic( def 
semantic_to_waveform( self, semantic_tokens: np.ndarray, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, ): @@ -132,7 +128,7 @@ def semantic_to_waveform( def generate_audio( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, text_temp: float = 0.7, waveform_temp: float = 0.7, base=None, @@ -194,9 +190,7 @@ def _set_voice_dirs(self, voice_dirs): return _voice_dirs # TODO: remove config from synthesize - def synthesize( - self, text, config, speaker_id="random", voice_dirs=None, **kwargs - ): # pylint: disable=unused-argument + def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs): # pylint: disable=unused-argument """Synthesize speech with the given input text. Args: @@ -269,10 +263,12 @@ def load_checkpoint( fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") + # The paths in the default config start with /root/.local/share/tts and need to be fixed self.config.LOCAL_MODEL_PATHS["text"] = text_model_path self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path + self.config.CACHE_DIR = str(Path(text_model_path).parent) self.load_bark_models() diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 79cdf1a7d4..05f4ae168d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,7 +1,6 @@ import copy import logging from abc import abstractmethod -from typing import Dict, Tuple import torch from coqpit import Coqpit @@ -62,7 +61,7 @@ def __init__( self.coarse_decoder = None @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: + def _format_aux_input(aux_input: dict) -> dict: """Set missing fields to their default values""" if aux_input: return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) @@ -94,9 +93,7 @@ def forward(self): def inference(self): pass - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load model checkpoint and set up internals. Args: @@ -141,7 +138,7 @@ def init_from_config(config: Coqpit): # TEST AND LOG FUNCTIONS # ########################## - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
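On the Bark `load_checkpoint` change above: besides pointing `LOCAL_MODEL_PATHS` at the supplied checkpoint files, it re-derives `CACHE_DIR` from wherever the text checkpoint actually lives instead of the hard-coded `/root/.local/share/tts` default. A sketch with an illustrative path (not the real model directory name):

```python
from pathlib import Path

# Illustrative path only; in the diff this comes from text_model_path.
text_model_path = "/home/user/.local/share/tts/bark/text_2.pt"
cache_dir = str(Path(text_model_path).parent)
print(cache_dir)  # /home/user/.local/share/tts/bark
```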
@@ -169,17 +166,19 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 91dd6e96d6..6a78cf603f 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,7 +1,6 @@ import logging import os import random -from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -107,7 +106,7 @@ def adjust_speech_rate(self, gpt_latents, length_scale): print(f"Interpolation failed: {e}") return gpt_latents - def init_multispeaker(self, config: Coqpit, data: List = None): + def init_multispeaker(self, config: Coqpit, data: list = None): """Set up for multi-speaker TTS. Initialize a speaker embedding layer if needed and define expected embedding @@ -142,7 +141,7 @@ def init_multispeaker(self, config: Coqpit, data: List = None): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs) -> dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} @@ -193,7 +192,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Generic batch formatting for `TTSDataset`. You must override this if you use a custom dataset. @@ -239,9 +238,9 @@ def format_batch(self, batch: Dict) -> Dict: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -313,9 +312,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -394,7 +393,7 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict: d_vector = None if self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] @@ -411,7 +410,7 @@ def _get_test_aux_input( } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -442,13 +441,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index e6db116081..7b6103512c 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -3,7 +3,6 @@ from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -65,7 +64,7 @@ class ForwardTTSE2eF0Dataset(F0Dataset): def __init__( self, ap, - samples: Union[List[List], List[Dict]], + samples: list[list] | list[dict], cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -275,15 +274,15 @@ def collate_fn(self, batch): @dataclass class VocoderConfig(Coqpit): resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) use_spectral_norm_discriminator: bool = False - upsampling_rates_discriminator: List[int] = field(default_factory=lambda: [4, 4, 4, 4]) - periods_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) - pretrained_model_path: Optional[str] = None + 
upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4]) + periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + pretrained_model_path: str | None = None @dataclass @@ -438,10 +437,6 @@ def __init__( periods=self.config.vocoder.periods_discriminator, ) - @property - def device(self): - return next(self.parameters()).device - @property def energy_scaler(self): return self.acoustic_model.energy_scaler @@ -557,7 +552,7 @@ def forward( attn_priors: torch.FloatTensor = None, d_vectors: torch.FloatTensor = None, speaker_idx: torch.LongTensor = None, - ) -> Dict: + ) -> dict: """Model's forward pass. Args: @@ -622,7 +617,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -646,7 +641,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -836,9 +831,7 @@ def _log(self, batch, outputs, name_prefix="train"): audios[f"{name_prefix}/vocoder_audio"] = sample_voice return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use, unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use, unused-argument """Create visualizations and waveform examples. For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1018,8 +1011,8 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
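The decorator swaps above (and in the models that follow) replace `torch.no_grad()` with `torch.inference_mode()` on pure inference paths. `inference_mode()` disables gradient tracking just like `no_grad()` and additionally skips autograd's view and version-counter bookkeeping, so it is usually the cheaper choice; the trade-off is that tensors created inside it cannot participate in autograd later. A small runnable sketch, with the module and function invented for illustration:

    # Illustrative only: inference_mode() as a drop-in for no_grad() on
    # code paths that never need gradients afterwards.
    import torch

    model = torch.nn.Linear(4, 2)

    @torch.inference_mode()
    def predict(x: torch.Tensor) -> torch.Tensor:
        return model(x)

    y = predict(torch.randn(1, 4))  # y.requires_grad is False; y is an inference tensor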
@@ -1045,18 +1038,22 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: d_vector=aux_inputs["d_vector"], ) # speaker_name = self.speaker_manager.speaker_names[aux_inputs["speaker_id"]] - test_audios["{}-audio".format(idx)] = outputs["wav"].T - test_audios["{}-audio_encoder".format(idx)] = outputs_gl["wav"].T - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_audios[f"{idx}-audio"] = outputs["wav"].T + test_audios[f"{idx}-audio_encoder"] = outputs_gl["wav"].T + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None d_vectors = None @@ -1164,9 +1161,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1221,7 +1218,7 @@ def get_data_loader( def get_criterion(self): return [VitsDiscriminatorLoss(self.config), DelightfulTTSLoss(self.config)] - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. Returns: @@ -1236,7 +1233,7 @@ def get_optimizer(self) -> List: ) return [optimizer_disc, optimizer_gen] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1244,7 +1241,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1263,9 +1260,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument self.energy_scaler.eval() @staticmethod - def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None - ): # pylint: disable=unused-argument + def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None): # pylint: disable=unused-argument """Initiate model from config Args: diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d09e3ea91b..497ac3f63a 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -333,7 +332,7 @@ def format_durations(self, o_dr_log, x_mask): def _forward_encoder( self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Encoding forward pass. 1. Embed speaker IDs if multi-speaker mode. @@ -381,7 +380,7 @@ def _forward_decoder( x_mask: torch.FloatTensor, y_lengths: torch.IntTensor, g: torch.FloatTensor, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Decoding forward pass. 1. Compute the decoder output mask @@ -415,7 +414,7 @@ def _forward_pitch_predictor( x_mask: torch.IntTensor, pitch: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Pitch predictor forward pass. 1. Predict pitch from encoder outputs. @@ -451,7 +450,7 @@ def _forward_energy_predictor( x_mask: torch.IntTensor, energy: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Energy predictor forward pass. 1. Predict energy from encoder outputs. @@ -483,7 +482,7 @@ def _forward_energy_predictor( def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -522,7 +521,7 @@ def _forward_aligner( alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -544,8 +543,8 @@ def forward( dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, energy: torch.FloatTensor = None, - aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument - ) -> Dict: + aux_input: dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument + ) -> dict: """Model's forward pass. Args: @@ -628,7 +627,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. 
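The assert rewrites scattered through this diff (here and in glow_tts, vits, and xtts below) match the ruff formatter's style: the condition stays on the `assert` line and only the message is parenthesized and wrapped, alongside the `str.format()` to f-string conversions. For example, with invented values:

    # Illustrative only: wrapped-message assert style plus an f-string key.
    def check_lengths(total: int, expected: int, idx: int = 0) -> str:
        assert total == expected, (
            f" [!] total duration {total} vs spectrogram length {expected}"
        )
        return f"{idx}-audio"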
@@ -771,9 +770,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -786,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -805,7 +800,7 @@ def on_train_step_start(self, trainer): self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod - def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "ForwardTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..5d03b53dc6 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,6 +1,5 @@ import logging import math -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -125,9 +124,9 @@ def init_multispeaker(self, config: Coqpit): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) if self.speaker_manager is not None: - assert ( - config.d_vector_dim == self.speaker_manager.embedding_dim - ), " [!] d-vector dimension mismatch b/w config and speaker manager." + assert config.d_vector_dim == self.speaker_manager.embedding_dim, ( + " [!] d-vector dimension mismatch b/w config and speaker manager." 
+ ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: logger.info("Init speaker_embedding layer.") @@ -162,7 +161,7 @@ def lock_act_norm_layers(self): if getattr(f, "set_ddi", False): f.set_ddi(False) - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): if aux_input is None: d_vectors = None speaker_ids = None @@ -179,7 +178,7 @@ def _set_speaker_input(self, aux_input: Dict): g = speaker_ids if speaker_ids is not None else d_vectors return g - def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: + def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = self._set_speaker_input(aux_input) # speaker embedding if g is not None: @@ -193,9 +192,7 @@ def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: g = F.normalize(g).unsqueeze(-1) # [b, h, 1] return g - def forward( - self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Args: x (torch.Tensor): @@ -262,7 +259,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,10 +315,8 @@ def inference_with_MAS( } return outputs - @torch.no_grad() - def decoder_inference( - self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Shapes: - y: :math:`[B, T, C]` @@ -341,10 +336,8 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() - def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = self._speaker_embedding(aux_input) # embedding pass @@ -457,14 +450,12 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,8 +464,8 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` 
models used by `Trainer`. You can override this for a different behaviour. @@ -503,11 +494,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios def preprocess(self, y, y_lengths, y_max_length, attn=None): @@ -522,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: @@ -543,7 +532,7 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index 0b3fadafbf..2cbf425884 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -102,7 +101,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -174,10 +173,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -195,7 +194,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -239,7 +238,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "NeuralhmmTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -346,17 +345,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -370,7 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 1c146b2eac..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -116,7 +115,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -188,10 +187,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -209,7 +208,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -255,7 +254,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "OverFlowConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -363,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -387,7 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..59173691f7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Tuple, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -218,7 +214,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) @@ -280,7 +276,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: torch.nn.Module) -> tuple[dict, dict]: """Perform a single training step by fetching the right set of samples from the batch. 
Args: @@ -332,7 +328,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -380,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -396,7 +390,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "TacotronConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -238,7 +234,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. @@ -309,7 +305,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module): + def train_step(self, batch: dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. 
Args: @@ -360,7 +356,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -403,9 +399,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) @@ -420,7 +414,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "Tacotron2Config", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 738e9dd9b3..a42d577676 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -342,7 +342,6 @@ def __init__(self, config: Coqpit): else self.args.autoregressive_batch_size ) self.enable_redaction = self.args.enable_redaction - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.enable_redaction: self.aligner = Wav2VecAlignment() @@ -685,9 +684,9 @@ def inference( text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. - assert ( - text_tokens.shape[-1] < 400 - ), "Too much text provided. Break the text up into separate segments and re-try inference." + assert text_tokens.shape[-1] < 400, ( + "Too much text provided. Break the text up into separate segments and re-try inference." 
+ ) if voice_samples is not None: ( diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7ec2519236..3b6cee7ead 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -3,7 +3,8 @@ import os from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple, Union +from pathlib import Path +from typing import Any import numpy as np import torch @@ -400,12 +401,12 @@ class VitsArgs(Coqpit): dilation_rate_flow: int = 1 num_layers_flow: int = 4 resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + periods_multi_period_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) use_sdp: bool = True noise_scale: float = 1.0 inference_noise_scale: float = 0.667 @@ -418,7 +419,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: List[str] = None + d_vector_file: list[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 @@ -565,10 +566,6 @@ def __init__( use_spectral_norm=self.args.use_spectral_norm_disriminator, ) - @property - def device(self): - return next(self.parameters()).device - def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -683,7 +680,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 raise RuntimeError(" [!] 
The weights of Text Encoder was not reinit check it !") logger.info("Text Encoder was reinit.") - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} @@ -713,7 +710,7 @@ def _freeze_layers(self): param.requires_grad = False @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -735,7 +732,7 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -808,7 +805,7 @@ def forward( # pylint: disable=dangerous-default-value y_lengths: torch.tensor, waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - ) -> Dict: + ) -> dict: """Forward pass of the model. Args: @@ -927,7 +924,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1014,7 +1011,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1055,8 +1052,8 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) - g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy(np.array(speaker_cond_src)).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy(np.array(speaker_cond_tgt)).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) @@ -1069,7 +1066,7 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) return o_hat, y_mask, (z, z_p, z_hat) - def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Perform a single training step. Run the model forward pass and compute losses. Args: @@ -1189,9 +1186,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unus ) return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use """Create visualizations and waveform examples. 
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1209,7 +1204,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1266,8 +1261,8 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -1293,17 +1288,21 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios[f"{idx}-audio"] = wav + test_figures[f"{idx}-alignment"] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None language_ids = None @@ -1367,9 +1366,9 @@ def format_batch_on_device(self, batch): ) if self.args.encoder_sample_rate: - assert batch["spec"].shape[2] == int( - batch["mel"].shape[2] / self.interpolate_factor - ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), ( + f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + ) else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" @@ -1426,9 +1425,9 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, rank: int = None, @@ -1490,7 +1489,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returns 2 optimizers in a list. First one is for the discriminator @@ -1508,7 +1507,7 @@ def get_optimizer(self) -> List: ) return [optimizer0, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1516,7 +1515,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1539,9 +1538,7 @@ def get_criterion(self): return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] - def load_checkpoint( - self, config, checkpoint_path, eval=False, strict=True, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) # compat band-aid for the pre-trained models to not use the encoder baked into the model @@ -1568,9 +1565,7 @@ def load_checkpoint( self.eval() assert not self.training - def load_fairseq_checkpoint( - self, config, checkpoint_dir, eval=False, strict=True - ): # pylint: disable=unused-argument, redefined-builtin + def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True): # pylint: disable=unused-argument, redefined-builtin """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms Performs some changes for compatibility. @@ -1585,13 +1580,16 @@ def load_fairseq_checkpoint( self.disc = None # set paths - config_file = os.path.join(checkpoint_dir, "config.json") - checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth") - vocab_file = os.path.join(checkpoint_dir, "vocab.txt") + checkpoint_dir = Path(checkpoint_dir) + config_file = checkpoint_dir / "config.json" + checkpoint_file = checkpoint_dir / "model.pth" + if not checkpoint_file.is_file(): + checkpoint_file = checkpoint_dir / "G_100000.pth" + vocab_file = checkpoint_dir / "vocab.txt" # set config params - with open(config_file, "r", encoding="utf-8") as file: + with open(config_file, encoding="utf-8") as f: # Load the JSON data as a dictionary - config_org = json.load(file) + config_org = json.load(f) self.config.audio.sample_rate = config_org["data"]["sampling_rate"] # self.config.add_blank = config['add_blank'] # set tokenizer @@ -1613,7 +1611,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -1626,15 +1624,15 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() if not config.model_args.encoder_sample_rate: - assert ( - upsample_rate == config.audio.hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + assert upsample_rate == config.audio.hop_length, ( + f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + ) else: encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor - assert ( - upsample_rate == effective_hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + assert upsample_rate == effective_hop_length, ( + f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + ) ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) @@ -1825,7 +1823,7 @@ def to_config(self) -> "CharactersConfig": class FairseqVocab(BaseVocabulary): - def __init__(self, vocab: str): + def __init__(self, vocab: str | os.PathLike[Any]): super(FairseqVocab).__init__() self.vocab = vocab @@ -1835,7 +1833,7 @@ def vocab(self): return self._vocab @vocab.setter - def vocab(self, vocab_file): + def vocab(self, vocab_file: str | os.PathLike[Any]): with open(vocab_file, encoding="utf-8") as f: self._vocab = [x.replace("\n", "") for x in f.readlines()] self.blank = self._vocab[0] diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 38091d7cff..833e2ddaa2 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -2,7 +2,6 @@ import os from dataclasses import dataclass from pathlib import Path -from typing import Optional import librosa import torch @@ -239,10 +238,6 @@ def init_models(self): cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, ) - @property - def device(self): - return next(self.parameters()).device - @torch.inference_mode() def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6): """Compute the conditioning latents for the GPT model from the given audio. @@ -384,9 +379,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, speed as latents used at inference. """ - assert ( - "zh-cn" if language == "zh" else language in self.config.languages - ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + assert "zh-cn" if language == "zh" else language in self.config.languages, ( + f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + ) # Use generally found best tuning knobs for generation. settings = { "temperature": config.temperature, @@ -526,9 +521,9 @@ def inference( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) with torch.no_grad(): gpt_codes = self.gpt.generate( @@ -631,9 +626,9 @@ def inference_stream( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." 
+ ) fake_inputs = self.gpt.compute_embeddings( gpt_cond_latent.to(self.device), @@ -722,13 +717,13 @@ def get_compatible_checkpoint_state_dict(self, model_path): def load_checkpoint( self, config: "XttsConfig", - checkpoint_dir: Optional[str] = None, - checkpoint_path: Optional[str] = None, - vocab_path: Optional[str] = None, + checkpoint_dir: str | None = None, + checkpoint_path: str | None = None, + vocab_path: str | None = None, eval: bool = True, strict: bool = True, use_deepspeed: bool = False, - speaker_file_path: Optional[str] = None, + speaker_file_path: str | None = None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 22e46b683a..d0269060c8 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -11,7 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) + max_len = max(len(x) for x in inputs) return np.stack([_pad_data(x, max_len) for x in inputs]) @@ -23,7 +23,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + max_len = max(x.shape[1] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -46,7 +46,7 @@ def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: def prepare_stop_target(inputs, out_steps): """Pad row vectors with 1.""" - max_len = max((x.shape[0] for x in inputs)) + max_len = max(x.shape[0] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index ff10f751f2..a3648eff4b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np import torch from scipy.stats import betabinom @@ -35,7 +33,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length: torch.Tensor, max_len: Optional[int] = None) -> torch.Tensor: +def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. Args: @@ -107,9 +105,9 @@ def rand_segments( _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size else: - assert all( - len_diff > 0 - ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all(len_diff > 0), ( + f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + ) segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices @@ -164,7 +162,7 @@ def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: def generate_attention( - duration: torch.Tensor, x_mask: torch.Tensor, y_mask: Optional[torch.Tensor] = None + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: torch.Tensor | None = None ) -> torch.Tensor: """Generate an attention map from the linear scale durations. 
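In `prepare_data`, `prepare_tensor`, and `prepare_stop_target` above, the extra parentheses around the generator expressions are dropped: a generator expression that is the sole argument of a call needs no parentheses of its own. For example, with invented values:

    # Illustrative only: max() over a bare generator expression.
    xs = [[1, 2, 3], [4], [5, 6]]
    max_len = max(len(x) for x in xs)  # 3, same result as max((len(x) for x in xs))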
diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index c72de2d4e6..5ce7759dd8 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: Union[str, os.PathLike[Any]] = "", - config: Optional[Coqpit] = None, + language_ids_file_path: str | os.PathLike[Any] = "", + config: Coqpit | None = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -40,11 +40,11 @@ def num_languages(self) -> int: return len(list(self.name_to_id.keys())) @property - def language_names(self) -> List: + def language_names(self) -> list: return list(self.name_to_id.keys()) @staticmethod - def parse_language_ids_from_config(c: Coqpit) -> Dict: + def parse_language_ids_from_config(c: Coqpit) -> dict: """Set language id from config. Args: @@ -70,13 +70,13 @@ def set_language_ids_from_config(self, c: Coqpit) -> None: self.name_to_id = self.parse_language_ids_from_config(c) @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Any: + def parse_ids_from_data(items: list, parse_key: str) -> Any: raise NotImplementedError - def set_ids_from_data(self, items: List, parse_key: str) -> Any: + def set_ids_from_data(self, items: list, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 3a715dd75d..49e93454f2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any import fsspec import numpy as np @@ -13,7 +13,7 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: Union[str, os.PathLike[Any]]): +def load_file(path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: @@ -25,7 +25,7 @@ def load_file(path: Union[str, os.PathLike[Any]]): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: Union[str, os.PathLike[Any]]): +def save_file(obj: Any, path: str | os.PathLike[Any]): path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: @@ -42,23 +42,23 @@ class BaseIDManager: It defines common `ID` manager specific functions. """ - def __init__(self, id_file_path: Union[str, os.PathLike[Any]] = ""): + def __init__(self, id_file_path: str | os.PathLike[Any] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: Union[str, os.PathLike[Any]]) -> Dict: + def _load_json(json_file_path: str | os.PathLike[Any]) -> dict: with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: Union[str, os.PathLike[Any]], data: dict) -> None: + def _save_json(json_file_path: str | os.PathLike[Any], data: dict) -> None: with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) - def set_ids_from_data(self, items: List, parse_key: str) -> None: + def set_ids_from_data(self, items: list, parse_key: str) -> None: """Set IDs from data samples. 
Args: @@ -66,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_ids_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Set IDs from a file. Args: @@ -74,7 +74,7 @@ def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save IDs to a json file. Args: @@ -96,7 +96,7 @@ def get_random_id(self) -> Any: return None @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + def parse_ids_from_data(items: list, parse_key: str) -> tuple[dict]: """Parse IDs from data samples retured by `load_tts_samples()`. Args: @@ -133,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[Union[str, os.PathLike[Any]], list[Union[str, os.PathLike[Any]]]] = "", - id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + embedding_file_path: str | os.PathLike[Any] | list[str | os.PathLike[Any]] = "", + id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -179,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def save_embeddings_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save embeddings to a json file. Args: @@ -188,7 +188,7 @@ def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> No save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): + def read_embeddings_from_file(file_path: str | os.PathLike[Any]): """Load embeddings from a json file. Args: @@ -207,7 +207,7 @@ def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def load_embeddings_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Load embeddings from a json file. Args: @@ -217,7 +217,7 @@ def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> file_path ) - def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.PathLike[Any]]]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[str | os.PathLike[Any]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -242,7 +242,7 @@ def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.Path # reset name_to_id to get the right speaker ids self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} - def get_embedding_by_clip(self, clip_idx: str) -> List: + def get_embedding_by_clip(self, clip_idx: str) -> list: """Get embedding by clip ID. 
Args: @@ -253,7 +253,7 @@ def get_embedding_by_clip(self, clip_idx: str) -> List: """ return self.embeddings[clip_idx]["embedding"] - def get_embeddings_by_name(self, idx: str) -> List[List]: + def get_embeddings_by_name(self, idx: str) -> list[list]: """Get all embeddings of a speaker. Args: @@ -264,7 +264,7 @@ def get_embeddings_by_name(self, idx: str) -> List[List]: """ return self.embeddings_by_names[idx] - def get_embeddings_by_names(self) -> Dict: + def get_embeddings_by_names(self) -> dict: """Get all embeddings by names. Returns: @@ -313,11 +313,11 @@ def get_random_embedding(self) -> Any: return None - def get_clips(self) -> List: + def get_clips(self) -> list: return sorted(self.embeddings.keys()) def init_encoder( - self, model_path: Union[str, os.PathLike[Any]], config_path: Union[str, os.PathLike[Any]], use_cuda=False + self, model_path: str | os.PathLike[Any], config_path: str | os.PathLike[Any], use_cuda=False ) -> None: """Initialize a speaker encoder model. @@ -334,9 +334,8 @@ def init_encoder( ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) - def compute_embedding_from_clip( - self, wav_file: Union[Union[str, os.PathLike[Any]], List[Union[str, os.PathLike[Any]]]] - ) -> list: + @torch.inference_mode() + def compute_embedding_from_clip(self, wav_file: str | os.PathLike[Any] | list[str | os.PathLike[Any]]) -> list: """Compute a embedding from a given audio file. Args: @@ -373,7 +372,7 @@ def _compute(wav_file: str): embedding = _compute(wav_file) return embedding[0].tolist() - def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + def compute_embeddings(self, feats: torch.Tensor | np.ndarray) -> list: """Compute embedding from features. Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 89c56583f5..6fab27de5a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Optional, Union +from typing import Any import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: Optional[list[list[Any]]] = None, + data_items: list[list[Any]] | None = None, d_vectors_file_path: str = "", - speaker_id_file_path: Union[str, os.PathLike[Any]] = "", - encoder_model_path: Union[str, os.PathLike[Any]] = "", - encoder_config_path: Union[str, os.PathLike[Any]] = "", + speaker_id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__( @@ -82,11 +82,11 @@ def num_speakers(self): def speaker_names(self): return list(self.name_to_id.keys()) - def get_speakers(self) -> List: + def get_speakers(self) -> list: return self.name_to_id @staticmethod - def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": + def init_from_config(config: "Coqpit", samples: list[list] | list[dict] = None) -> "SpeakerManager": """Initialize a speaker manager from config Args: @@ -150,7 +150,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: +def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: """Initiate a `SpeakerManager` instance by the provided config. 
Args: @@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.name_to_id speaker_manager.load_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." + assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), ( + " [!] You cannot introduce new speakers to a pre-trained model." + ) elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. speaker_manager.load_embeddings_from_file(c.d_vector_file) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index eddf05db3f..660370a832 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,6 +1,5 @@ # Adopted from https://github.com/photosynthesis-team/piq -from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -24,11 +23,11 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: def _validate_input( - tensors: List[torch.Tensor], - dim_range: Tuple[int, int] = (0, -1), - data_range: Tuple[float, float] = (0.0, -1.0), + tensors: list[torch.Tensor], + dim_range: tuple[int, int] = (0, -1), + data_range: tuple[float, float] = (0.0, -1.0), # size_dim_range: Tuple[float, float] = (0., -1.), - size_range: Optional[Tuple[int, int]] = None, + size_range: tuple[int, int] | None = None, ) -> None: r"""Check that input(-s) satisfies the requirements Args: @@ -50,16 +49,16 @@ def _validate_input( if size_range is None: assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - assert ( - t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] - ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], ( + f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + ) if dim_range[0] == dim_range[1]: assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" elif dim_range[0] < dim_range[1]: - assert ( - dim_range[0] <= t.dim() <= dim_range[1] - ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + assert dim_range[0] <= t.dim() <= dim_range[1], ( + f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + ) if data_range[0] < data_range[1]: assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" @@ -89,13 +88,13 @@ def ssim( y: torch.Tensor, kernel_size: int = 11, kernel_sigma: float = 1.5, - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, reduction: str = "mean", full: bool = False, downsample: bool = True, k1: float = 0.01, k2: float = 0.03, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: r"""Interface of Structural Similarity (SSIM) index. Inputs supposed to be in range ``[0, data_range]``. To match performance with skimage and tensorflow set ``'downsample' = True``. 
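
As a side note on the assertion rewrites in the hunks above (and repeated throughout this patch): the new layout keeps the condition on one line and wraps the long message in parentheses, where the old layout wrapped the condition instead. A small standalone sketch of the two layouts, using made-up values, not code from the repository:

```python
# Illustrative only: the two assert layouts seen in this patch, shown on a
# standalone example with hypothetical values.
fft_size, win_length = 2048, 1024

# Old layout (removed by this patch): the *condition* is wrapped.
assert (
    win_length <= fft_size
), f" [!] win_length cannot be larger than fft_size - {win_length} vs {fft_size}"

# New layout (introduced by this patch): the condition stays on one line and
# the *message* is parenthesized instead; runtime behaviour is identical.
assert win_length <= fft_size, (
    f" [!] win_length cannot be larger than fft_size - {win_length} vs {fft_size}"
)
```
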
@@ -218,7 +217,7 @@ def __init__( k2: float = 0.03, downsample: bool = True, reduction: str = "mean", - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, ) -> None: super().__init__() @@ -270,7 +269,7 @@ def _ssim_per_channel( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. Args: @@ -286,8 +285,7 @@ def _ssim_per_channel( """ if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 @@ -321,7 +319,7 @@ def _ssim_per_channel_complex( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. Args: @@ -338,8 +336,7 @@ def _ssim_per_channel_complex( n_channels = x.size(1) if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 5dc4cc569f..c09c3f5aa2 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,13 +1,9 @@ -from typing import Dict, Optional, Union - import numpy as np import torch from torch import nn -def numpy_to_torch( - np_array: np.ndarray, dtype: torch.dtype, device: Union[str, torch.device] = "cpu" -) -> Optional[torch.Tensor]: +def numpy_to_torch(np_array: np.ndarray, dtype: torch.dtype, device: str | torch.device = "cpu") -> torch.Tensor | None: if np_array is None: return None return torch.as_tensor(np_array, dtype=dtype, device=device) @@ -31,7 +27,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, -) -> Dict: +) -> dict: """Run a torch model for inference. It does not support batch inference. 
Args: @@ -75,14 +71,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def id_to_torch(aux_id, device: str | torch.device = "cpu") -> torch.Tensor | None: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: +def embedding_to_torch(d_vector, device: str | torch.device = "cpu") -> torch.Tensor | None: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index cddcb00fd5..1537240380 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -45,7 +45,7 @@ def tag_text(text: str): # create start and end text = "start" + text + "end" # tag text - parts = re.split("[\u0600-\u06FF]+", text) + parts = re.split("[\u0600-\u06ff]+", text) # remove non chars parts = [p for p in parts if p.strip()] # unique parts diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 4bf9bf6bd5..f8beaef036 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,6 +1,5 @@ import logging from dataclasses import replace -from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig @@ -47,7 +46,7 @@ class BaseVocabulary: vocab (Dict): A dictionary of characters and their corresponding indices. """ - def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): + def __init__(self, vocab: dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): self.vocab = vocab self.pad = pad self.blank = blank @@ -290,9 +289,9 @@ def _create_vocab(self): self.vocab = _vocab + list(self._punctuations) if self.is_unique: duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} - assert ( - len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {duplicates}" + assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), ( + f" [!] There are duplicate characters in the character set. {duplicates}" + ) def char_to_id(self, char: str) -> int: try: diff --git a/TTS/tts/utils/text/chinese_mandarin/numbers.py b/TTS/tts/utils/text/chinese_mandarin/numbers.py index 4787ea6100..3e6a043918 100644 --- a/TTS/tts/utils/text/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/text/chinese_mandarin/numbers.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Licensed under WTFPL or the Unlicense or CC0. 
# This uses Python 3, but it's easy to port to Python 2 by changing diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index e9d62e9d06..4dccdd5778 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,5 +1,3 @@ -from typing import List - try: import jieba import pypinyin @@ -9,7 +7,7 @@ from .pinyinToPhonemes import PINYIN_DICT -def _chinese_character_to_pinyin(text: str) -> List[str]: +def _chinese_character_to_pinyin(text: str) -> list[str]: pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) pinyins_flat_list = [item for sublist in pinyins for item in sublist] return pinyins_flat_list @@ -25,9 +23,9 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) - pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + pinyined_text: list[str] = _chinese_character_to_pinyin(tokenized_text) - results: List[str] = [] + results: list[str] = [] for token in pinyined_text: if token[-1] in "12345": # TODO transform to is_pinyin() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f496b9f0dd..795ab246d2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,6 @@ """Set of default text cleaners""" import re -from typing import Optional from unicodedata import normalize from anyascii import anyascii @@ -47,7 +46,7 @@ def remove_aux_symbols(text: str) -> str: return text -def replace_symbols(text: str, lang: Optional[str] = "en") -> str: +def replace_symbols(text: str, lang: str | None = "en") -> str: """Replace symbols based on the language tag. Args: diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index f206fb043b..9c0df06196 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import re VALID_SYMBOLS = [ @@ -121,7 +119,7 @@ def get_arpabet(word, cmudict, punctuation_symbols): word = word[:-1] arpabet = cmudict.lookup(word) if arpabet is not None: - return first_symbol + "{%s}" % arpabet[0] + last_symbol + return first_symbol + "{%s}" % arpabet[0] + last_symbol # noqa: UP031 return first_symbol + word + last_symbol diff --git a/TTS/tts/utils/text/english/abbreviations.py b/TTS/tts/utils/text/english/abbreviations.py index cd93c13c8e..20042b255b 100644 --- a/TTS/tts/utils/text/english/abbreviations.py +++ b/TTS/tts/utils/text/english/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in english: abbreviations_en = [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index e8377ede87..be2a4b3084 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,7 +1,6 @@ -""" from https://github.com/keithito/tacotron """ +"""from https://github.com/keithito/tacotron""" import re -from typing import Dict import inflect @@ -21,7 +20,7 @@ def _expand_decimal_point(m): return m.group(1).replace(".", " point ") -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: +def __expand_currency(value: str, inflection: dict[float, str]) -> str: parts = value.replace(",", "").split(".") if len(parts) > 2: return f"{value} {inflection[2]}" # Unexpected format @@ -85,7 +84,11 @@ def _expand_number(m): if num % 100 == 0: return _inflect.number_to_words(num // 100) + " hundred" return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") + try: + text = _inflect.number_to_words(num, andword="") + except inflect.NumOutOfRangeError: + text = _inflect.number_to_words(num, group=1).replace(", ", " ") + return text def normalize_numbers(text): diff --git a/TTS/tts/utils/text/french/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py index f580dfed7b..e317bbbf3a 100644 --- a/TTS/tts/utils/text/french/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("M", "monsieur"), ("Mlle", "mademoiselle"), @@ -38,7 +38,7 @@ ("boul", "boulevard"), ] ] + [ - (re.compile("\\b%s" % x[0]), x[1]) + (re.compile(f"\\b{x[0]}"), x[1]) for x in [ ("Mlle", "mademoiselle"), ("Mlles", "mesdemoiselles"), diff --git a/TTS/tts/utils/text/korean/ko_dictionary.py b/TTS/tts/utils/text/korean/ko_dictionary.py index 9b739339c6..706f9f5daf 100644 --- a/TTS/tts/utils/text/korean/ko_dictionary.py +++ b/TTS/tts/utils/text/korean/ko_dictionary.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Add the word you want to the dictionary. 
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 423aeed377..1b1e0ca0fb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py index 3c4a35bbfa..3be7354636 100644 --- a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_bn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"bn": "Bangla"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 5e701df458..6cc6ec0b37 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc import logging -from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -37,7 +36,7 @@ class BasePhonemizer(abc.ABC): def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): # ensure the backend is installed on the system if not self.is_available(): - raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover + raise RuntimeError(f"{self.name()} not installed on your system") # pragma: nocover # ensure the backend support the requested language self._language = self._init_language(language) @@ -53,7 +52,7 @@ def _init_language(self, language): """ if not self.is_supported_language(language): - raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") + raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend') return language @property @@ -93,7 +92,7 @@ def is_supported_language(self, language): def _phonemize(self, text, separator): """The main phonemization method""" - def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: + def _phonemize_preprocess(self, text) -> tuple[list[str], list]: """Preprocess the text before phonemization 1. 
remove spaces diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py index e5fcab6e09..fa4a515d1a 100644 --- a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -34,7 +32,7 @@ def _phonemize(self, text, separator): return self.phonemize_be(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"be": "Belarusian"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index a15df716e7..dbcb8994a7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -5,7 +5,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional from packaging.version import Version @@ -104,7 +103,7 @@ class ESpeak(BasePhonemizer): def __init__( self, language: str, - backend: Optional[str] = None, + backend: str | None = None, punctuations: str = Punctuation.default_puncs(), keep_puncs: bool = True, ): @@ -184,7 +183,7 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False else: args.append("--ipa=1") if tie: - args.append("--tie=%s" % tie) + args.append(f"--tie={tie}") tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") tmp.write(text) diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index f3e9c9abd4..836fccf5b8 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from typing import List import gruut from gruut_ipa import IPA @@ -114,7 +113,7 @@ def is_supported_language(self, language): return gruut.is_language_supported(language) @staticmethod - def supported_languages() -> List: + def supported_languages() -> list: """Get a dictionary of supported languages. 
Returns: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 878e5e5296..b3b3ba4db7 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -51,7 +49,7 @@ def phonemize(self, text: str, separator="|", language=None) -> str: return self._phonemize(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ja-jp": "Japanese (Japan)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index 0bdba2137b..93930d064e 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -44,7 +42,7 @@ def phonemize(self, text: str, separator: str = "", character: str = "hangeul", return self._phonemize(text, separator, character) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ko-kr": "hangeul(korean)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 1a9e98b091..87fb940f6b 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name @@ -19,7 +18,7 @@ class MultiPhonemizer: lang_to_phonemizer = {} - def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + def __init__(self, lang_to_phonemizer_name: dict = {}) -> None: # pylint: disable=dangerous-default-value for k, v in lang_to_phonemizer_name.items(): if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] @@ -29,7 +28,7 @@ def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disab self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod - def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: + def init_phonemizers(lang_to_phonemizer_name: dict) -> dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) @@ -44,7 +43,7 @@ def phonemize(self, text, separator="|", language=""): raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) - def supported_languages(self) -> List: + def supported_languages(self) -> list: return list(self.lang_to_phonemizer.keys()) def print_logs(self, level: int = 0): diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index 41480c4173..9e70b03a0c 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_zh_cn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"zh-cn": "Chinese (China)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f653cdf13f..07a8753884 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,5 +1,6 @@ import logging -from typing import Callable, Dict, List, Union +from collections.abc import Callable +from typing import Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes @@ -43,7 +44,7 @@ def __init__( use_phonemes=False, text_cleaner: Callable = None, characters: "BaseCharacters" = None, - phonemizer: Union["Phonemizer", Dict] = None, + phonemizer: Union["Phonemizer", dict] = None, add_blank: bool = False, use_eos_bos=False, ): @@ -65,7 +66,7 @@ def characters(self, new_characters): self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: """Encodes a string of text as a sequence of IDs.""" token_ids = [] for char in text: @@ -80,14 +81,14 @@ def encode(self, text: str) -> List[int]: logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char)) return token_ids - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: """Decodes a sequence of IDs to a string of text.""" text = "" for token_id in token_ids: text += self.characters.id_to_char(token_id) return text - def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument + def text_to_ids(self, text: str, language: str = None) -> list[int]: # pylint: disable=unused-argument """Converts a string of text to a sequence of token IDs. Args: @@ -121,15 +122,15 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: text = self.pad_with_bos_eos(text) return text - def ids_to_text(self, id_sequence: List[int]) -> str: + def ids_to_text(self, id_sequence: list[int]) -> str: """Converts a sequence of token IDs to a string of text.""" return self.decode(id_sequence) - def pad_with_bos_eos(self, char_sequence: List[str]): + def pad_with_bos_eos(self, char_sequence: list[str]): """Pads a sequence with the special BOS and EOS characters.""" return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id] - def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False): + def intersperse_blank_char(self, char_sequence: list[str], use_blank_char: bool = False): """Intersperses the blank character between characters in a sequence. Use the ```blank``` character if defined else use the ```pad``` character. 
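
One nuance in the tokenizer hunk above: `Union` is kept (rather than the `|` operator) for the `phonemizer` parameter, presumably because one operand is the string forward reference `"Phonemizer"`, which does not support `|` when the annotation is evaluated at runtime. A minimal standalone sketch of that distinction; the class and function below are hypothetical stand-ins, not code from the repository:

```python
from typing import Union


class Phonemizer:
    """Hypothetical stand-in for the real phonemizer class, for illustration only."""


# Works: typing.Union accepts a string forward reference.
def make_tokenizer(phonemizer: Union["Phonemizer", dict] = None) -> None: ...


# Would fail at import time without `from __future__ import annotations`,
# because a str has no "|" operator:
#   TypeError: unsupported operand type(s) for |: 'str' and 'type'
# def make_tokenizer(phonemizer: "Phonemizer" | dict = None) -> None: ...
```
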
@@ -163,7 +164,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): """ # init cleaners text_cleaner = None - if isinstance(config.text_cleaner, (str, list)): + if isinstance(config.text_cleaner, str | list): text_cleaner = getattr(cleaners, config.text_cleaner) # init characters diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 0cba7fc8a8..7fd4259178 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,7 +1,7 @@ import logging import os from io import BytesIO -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -21,7 +21,7 @@ def build_mel_basis( fft_size: int, num_mels: int, mel_fmin: int, - mel_fmax: Optional[int] = None, + mel_fmax: int | None = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. @@ -177,8 +177,8 @@ def stft( *, y: np.ndarray, fft_size: int, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -205,8 +205,8 @@ def stft( def istft( *, y: np.ndarray, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, window: str = "hann", center: bool = True, **kwargs, @@ -248,8 +248,8 @@ def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool def compute_f0( *, x: np.ndarray, - pitch_fmax: Optional[float] = None, - pitch_fmin: Optional[float] = None, + pitch_fmax: float | None = None, + pitch_fmin: float | None = None, hop_length: int, win_length: int, sample_rate: int, @@ -408,7 +408,7 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n def load_wav( - *, filename: Union[str, os.PathLike[Any]], sample_rate: Optional[int] = None, resample: bool = False, **kwargs + *, filename: str | os.PathLike[Any], sample_rate: int | None = None, resample: bool = False, **kwargs ) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. @@ -437,7 +437,7 @@ def load_wav( def save_wav( *, wav: np.ndarray, - path: Union[str, os.PathLike[Any]], + path: str | os.PathLike[Any], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index bf07333aea..55b8575aa4 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Optional, Union +from typing import Any import librosa import numpy as np @@ -222,9 +222,9 @@ def __init__( self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert ( - self.win_length <= self.fft_size - ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert self.win_length <= self.fft_size, ( + f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + ) members = vars(self) logger.info("Setting up Audio Processor...") for key, value in members.items(): @@ -283,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_norm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) return S_norm S_norm = self.max_norm * S_norm @@ -318,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_denorm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db @@ -351,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np. if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key], ( + f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + ) return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init @@ -549,7 +553,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: str | os.PathLike[Any], sr: int | None = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -576,9 +580,7 @@ def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = N x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav( - self, wav: np.ndarray, path: Union[str, os.PathLike[Any]], sr: Optional[int] = None, pipe_out=None - ) -> None: + def save_wav(self, wav: np.ndarray, path: str | os.PathLike[Any], sr: int | None = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. 
Args: diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py deleted file mode 100644 index 511d215c65..0000000000 --- a/TTS/utils/callbacks.py +++ /dev/null @@ -1,105 +0,0 @@ -class TrainerCallback: - @staticmethod - def on_init_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_start"): - trainer.model.module.on_init_start(trainer) - else: - if hasattr(trainer.model, "on_init_start"): - trainer.model.on_init_start(trainer) - - if hasattr(trainer.criterion, "on_init_start"): - trainer.criterion.on_init_start(trainer) - - if hasattr(trainer.optimizer, "on_init_start"): - trainer.optimizer.on_init_start(trainer) - - @staticmethod - def on_init_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_end"): - trainer.model.module.on_init_end(trainer) - else: - if hasattr(trainer.model, "on_init_end"): - trainer.model.on_init_end(trainer) - - if hasattr(trainer.criterion, "on_init_end"): - trainer.criterion.on_init_end(trainer) - - if hasattr(trainer.optimizer, "on_init_end"): - trainer.optimizer.on_init_end(trainer) - - @staticmethod - def on_epoch_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_start"): - trainer.model.module.on_epoch_start(trainer) - else: - if hasattr(trainer.model, "on_epoch_start"): - trainer.model.on_epoch_start(trainer) - - if hasattr(trainer.criterion, "on_epoch_start"): - trainer.criterion.on_epoch_start(trainer) - - if hasattr(trainer.optimizer, "on_epoch_start"): - trainer.optimizer.on_epoch_start(trainer) - - @staticmethod - def on_epoch_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_end"): - trainer.model.module.on_epoch_end(trainer) - else: - if hasattr(trainer.model, "on_epoch_end"): - trainer.model.on_epoch_end(trainer) - - if hasattr(trainer.criterion, "on_epoch_end"): - trainer.criterion.on_epoch_end(trainer) - - if hasattr(trainer.optimizer, "on_epoch_end"): - trainer.optimizer.on_epoch_end(trainer) - - @staticmethod - def on_train_step_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_start"): - trainer.model.module.on_train_step_start(trainer) - else: - if hasattr(trainer.model, "on_train_step_start"): - trainer.model.on_train_step_start(trainer) - - if hasattr(trainer.criterion, "on_train_step_start"): - trainer.criterion.on_train_step_start(trainer) - - if hasattr(trainer.optimizer, "on_train_step_start"): - trainer.optimizer.on_train_step_start(trainer) - - @staticmethod - def on_train_step_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_end"): - trainer.model.module.on_train_step_end(trainer) - else: - if hasattr(trainer.model, "on_train_step_end"): - trainer.model.on_train_step_end(trainer) - - if hasattr(trainer.criterion, "on_train_step_end"): - trainer.criterion.on_train_step_end(trainer) - - if hasattr(trainer.optimizer, "on_train_step_end"): - trainer.optimizer.on_train_step_end(trainer) - - @staticmethod - def on_keyboard_interrupt(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_keyboard_interrupt"): - trainer.model.module.on_keyboard_interrupt(trainer) - else: - if hasattr(trainer.model, "on_keyboard_interrupt"): - trainer.model.on_keyboard_interrupt(trainer) - - if hasattr(trainer.criterion, "on_keyboard_interrupt"): - 
trainer.criterion.on_keyboard_interrupt(trainer) - - if hasattr(trainer.optimizer, "on_keyboard_interrupt"): - trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index 7206ffd508..01f303f98d 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -1,4 +1,4 @@ -from typing import Generator +from collections.abc import Generator from trainer.trainer_utils import get_optimizer diff --git a/TTS/utils/download.py b/TTS/utils/download.py index e94b1d68c8..75ef9164f6 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -7,8 +7,9 @@ import urllib import urllib.request import zipfile +from collections.abc import Iterable from os.path import expanduser -from typing import Any, Iterable, List, Optional +from typing import Any from torch.utils.model_zoo import tqdm @@ -16,7 +17,7 @@ def stream_url( - url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True + url: str, start_byte: int | None = None, block_size: int = 32 * 1024, progress_bar: bool = True ) -> Iterable: """Stream url by chunk @@ -36,7 +37,7 @@ def stream_url( req = urllib.request.Request(url) if start_byte: - req.headers["Range"] = "bytes={}-".format(start_byte) + req.headers["Range"] = f"bytes={start_byte}-" with ( urllib.request.urlopen(req) as upointer, @@ -61,8 +62,8 @@ def stream_url( def download_url( url: str, download_folder: str, - filename: Optional[str] = None, - hash_value: Optional[str] = None, + filename: str | None = None, + hash_value: str | None = None, hash_type: str = "sha256", progress_bar: bool = True, resume: bool = False, @@ -88,10 +89,10 @@ def download_url( filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): mode = "ab" - local_size: Optional[int] = os.path.getsize(filepath) + local_size: int | None = os.path.getsize(filepath) elif not resume and os.path.exists(filepath): - raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"{filepath} already exists. Delete the file manually and retry.") else: mode = "wb" local_size = None @@ -100,7 +101,7 @@ def download_url( with open(filepath, "rb") as file_obj: if validate_file(file_obj, hash_value, hash_type): return - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -108,7 +109,7 @@ def download_url( with open(filepath, "rb") as file_obj: if hash_value and not validate_file(file_obj, hash_value, hash_type): - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: @@ -140,7 +141,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> return hash_func.hexdigest() == hash_value -def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: +def extract_archive(from_path: str, to_path: str | None = None, overwrite: bool = False) -> list[str]: """Extract archive. Args: from_path (str): the path of the archive. 
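
For orientation, a hypothetical call of the two download helpers whose updated signatures appear above. The URL and paths are placeholders, and the filename is passed explicitly rather than relying on defaults:

```python
from TTS.utils.download import download_url, extract_archive

# Hypothetical usage sketch of the helpers edited above; the URL and paths
# are placeholders, not values taken from the patch.
url = "https://example.com/corpus.tar.gz"
download_url(
    url,
    download_folder="/tmp/tts_downloads",
    filename="corpus.tar.gz",  # set explicitly instead of relying on the default
    hash_value=None,           # skip checksum verification in this sketch
    progress_bar=True,
    resume=False,
)
extracted = extract_archive(
    "/tmp/tts_downloads/corpus.tar.gz", to_path="/tmp/tts_corpus", overwrite=False
)
print(extracted)  # list[str] of extracted paths, per the annotated return type
```
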
diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 8705873982..c06c2649ad 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,6 +1,5 @@ import logging import os -from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive @@ -21,7 +20,7 @@ def download_ljspeech(path: str): extract_archive(archive) -def download_vctk(path: str, use_kaggle: Optional[bool] = False): +def download_vctk(path: str, use_kaggle: bool | None = False): """Download and extract VCTK dataset. Args: @@ -49,7 +48,7 @@ def download_tweb(path: str): download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) -def download_libri_tts(path: str, subset: Optional[str] = "all"): +def download_libri_tts(path: str, subset: str | None = "all"): """Download and extract libri tts dataset. Args: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 54bb5ba825..e1df6f6ed4 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,11 +1,11 @@ -# -*- coding: utf-8 -*- import datetime import importlib import logging import os import re +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable, Dict, Optional, TextIO, TypeVar, Union +from typing import Any, TextIO, TypeVar import torch from packaging.version import Version @@ -16,11 +16,11 @@ _T = TypeVar("_T") -def exists(val: Union[_T, None]) -> TypeIs[_T]: +def exists(val: _T | None) -> TypeIs[_T]: return val is not None -def default(val: Union[_T, None], d: Union[_T, Callable[[], _T]]) -> _T: +def default(val: _T | None, d: _T | Callable[[], _T]) -> _T: if exists(val): return val return d() if callable(d) else d @@ -31,6 +31,7 @@ def to_camel(text): text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) text = text.replace("Tts", "TTS") text = text.replace("vc", "VC") + text = text.replace("Knn", "KNN") return text @@ -68,7 +69,7 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: dict, kwargs: dict) -> dict: """Format kwargs to hande auxilary inputs to models. Args: @@ -79,9 +80,9 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: Dict: arguments with formatted auxilary inputs. """ kwargs = kwargs.copy() - for name in def_args: + for name, arg in def_args.items(): if name not in kwargs or kwargs[name] is None: - kwargs[name] = def_args[name] + kwargs[name] = arg return kwargs @@ -107,9 +108,9 @@ def setup_logger( logger_name: str, level: int = logging.INFO, *, - formatter: Optional[logging.Formatter] = None, - stream: Optional[TextIO] = None, - log_dir: Optional[Union[str, os.PathLike[Any]]] = None, + formatter: logging.Formatter | None = None, + stream: TextIO | None = None, + log_dir: str | os.PathLike[Any] | None = None, log_name: str = "log", ) -> None: """Set up a logger. 
@@ -145,6 +146,6 @@ def is_pytorch_at_least_2_4() -> bool: return Version(torch.__version__) >= Version("2.4") -def optional_to_str(x: Optional[Any]) -> str: +def optional_to_str(x: Any | None) -> str: """Convert input to string, using empty string if input is None.""" return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index d7d4deab9d..20d6ab226b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,7 +6,7 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Any, Optional, TypedDict, Union +from typing import Any, TypedDict import fsspec import requests @@ -15,6 +15,7 @@ from typing_extensions import Required from TTS.config import load_config, read_json_with_comments +from TTS.vc.configs.knnvc_config import KNNVCConfig logger = logging.getLogger(__name__) @@ -26,12 +27,12 @@ class ModelItem(TypedDict, total=False): license: str author: str contact: str - commit: Optional[str] + commit: str | None model_hash: str tos_required: bool - default_vocoder: Optional[str] - model_url: Union[str, list[str]] - github_rls_url: Union[str, list[str]] + default_vocoder: str | None + model_url: str | list[str] + github_rls_url: str | list[str] hf_url: list[str] @@ -48,7 +49,7 @@ class ModelItem(TypedDict, total=False): } -class ModelManager(object): +class ModelManager: tqdm_progress = None """Manage TTS models defined in .models.json. It provides an interface to list and download @@ -65,8 +66,8 @@ class ModelManager(object): def __init__( self, - models_file: Optional[Union[str, os.PathLike[Any]]] = None, - output_prefix: Optional[Union[str, os.PathLike[Any]]] = None, + models_file: str | os.PathLike[Any] | None = None, + output_prefix: str | os.PathLike[Any] | None = None, progress_bar: bool = False, ) -> None: super().__init__() @@ -83,7 +84,7 @@ def __init__( path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: + def read_models_file(self, file_path: str | os.PathLike[Any]) -> None: """Read .models.json as a dict Args: @@ -267,13 +268,13 @@ def set_model_url(model_item: ModelItem) -> ModelItem: model_item["model_url"] = model_item["github_rls_url"] elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] - elif "fairseq" in model_item["model_name"]: + elif "fairseq" in model_item.get("model_name", ""): model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" - elif "xtts" in model_item["model_name"]: + elif "xtts" in model_item.get("model_name", ""): model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, Optional[str]]: + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, str | None]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") @@ -367,6 +368,9 @@ def create_dir_and_download_model(self, model_name: str, model_item: ModelItem, logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e + checkpoints = list(Path(output_path).glob("*.pt*")) + if len(checkpoints) == 1: + checkpoints[0].rename(checkpoints[0].parent / "model.pth") self.print_model_license(model_item=model_item) def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: @@ -385,7 +389,7 @@ def check_if_configs_are_equal(self, model_name: 
str, model_item: ModelItem, out logger.info("%s is already downloaded however it has been changed. Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelItem]: + def download_model(self, model_name: str) -> tuple[Path, Path | None, ModelItem]: """Download model files given the full model name. Model name is in the format 'type/language/dataset/model' @@ -431,11 +435,14 @@ def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelIt output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) else: output_config_path = output_model_path / "config.json" + if model == "knnvc" and not output_config_path.exists(): + knnvc_config = KNNVCConfig() + knnvc_config.save_json(output_config_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item @@ -464,7 +471,7 @@ def _find_files(output_path: Path) -> tuple[Path, Path]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: Path) -> Optional[Path]: + def _find_speaker_encoder(output_path: Path) -> Path | None: """Find the speaker encoder file in the output path Args: @@ -516,7 +523,7 @@ def _update_paths(self, output_path: Path, config_path: Path) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name: str, new_path: Optional[Path], config_path: Path) -> None: + def _update_path(field_name: str, new_path: Path | None, config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" if new_path is not None and new_path.is_file(): config = load_config(str(config_path)) @@ -612,9 +619,7 @@ def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) - rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files( - file_urls: list[str], output_folder: Union[str, os.PathLike[Any]], progress_bar: bool - ) -> None: + def _download_model_files(file_urls: list[str], output_folder: str | os.PathLike[Any], progress_bar: bool) -> None: """Download the github releases""" output_folder = Path(output_folder) for file_url in file_urls: diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index cbd14990f3..b893d115c9 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -9,16 +9,16 @@ class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) + raise ValueError(f"Invalid learning rate: {lr}") if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) + raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + raise ValueError(f"Invalid beta parameter at 
index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if isinstance(params, list | tuple) and len(params) > 0 and isinstance(params[0], dict): for param in params: if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index b08a763a33..d24733977a 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,6 @@ import math import random -from typing import Callable, List, Union +from collections.abc import Callable from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler @@ -49,9 +49,9 @@ def __init__( label_key="class_name", ): super().__init__(dataset_items) - assert ( - batch_size % (num_classes_in_batch * num_gpus) == 0 - ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + ) label_indices = {} for idx, item in enumerate(dataset_items): @@ -176,7 +176,7 @@ def __init__( data, batch_size, drop_last, - sort_key: Union[Callable, List] = identity, + sort_key: Callable | list = identity, bucket_size_multiplier=100, ): super().__init__(sampler, batch_size, drop_last) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 517cb7d2b2..cebb094a48 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -2,7 +2,7 @@ import os import time from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any import numpy as np import pysbd @@ -30,18 +30,18 @@ class Synthesizer(nn.Module): def __init__( self, *, - tts_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - tts_config_path: Optional[Union[str, os.PathLike[Any]]] = None, - tts_speakers_file: Optional[Union[str, os.PathLike[Any]]] = None, - tts_languages_file: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - vocoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - encoder_config: Optional[Union[str, os.PathLike[Any]]] = None, - vc_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, - vc_config: Optional[Union[str, os.PathLike[Any]]] = None, - model_dir: Optional[Union[str, os.PathLike[Any]]] = None, - voice_dir: Optional[Union[str, os.PathLike[Any]]] = None, + tts_checkpoint: str | os.PathLike[Any] | None = None, + tts_config_path: str | os.PathLike[Any] | None = None, + tts_speakers_file: str | os.PathLike[Any] | None = None, + tts_languages_file: str | os.PathLike[Any] | None = None, + vocoder_checkpoint: str | os.PathLike[Any] | None = None, + vocoder_config: str | os.PathLike[Any] | None = None, + encoder_checkpoint: str | os.PathLike[Any] | None = None, + encoder_config: str | os.PathLike[Any] | None = None, + vc_checkpoint: str | os.PathLike[Any] | None = None, + vc_config: str | os.PathLike[Any] | None = None, + model_dir: str | os.PathLike[Any] | None = None, + voice_dir: str | os.PathLike[Any] | None = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. 
It takes a tts and a vocoder @@ -98,12 +98,12 @@ def __init__( if tts_checkpoint: self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda) - if vocoder_checkpoint: - self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) - if vc_checkpoint and model_dir == "": self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda) + if vocoder_checkpoint: + self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) + if model_dir: if "fairseq" in model_dir: self._load_fairseq_from_dir(model_dir, use_cuda) @@ -139,7 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + self.output_sample_rate = self.vc_config.audio.get( + "output_sample_rate", self.vc_config.audio.get("sample_rate", None) + ) self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -246,7 +248,7 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N if use_cuda: self.vocoder_model.cuda() - def split_into_sentences(self, text) -> List[str]: + def split_into_sentences(self, text) -> list[str]: """Split give text into sentences. Args: @@ -257,7 +259,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + def save_wav(self, wav: list[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. Args: @@ -272,9 +274,21 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: - output_wav = self.vc_model.voice_conversion(source_wav, target_wav) - return output_wav + def voice_conversion(self, source_wav: str, target_wav: str | list[str], **kwargs) -> list[int]: + start_time = time.time() + + if not isinstance(target_wav, list): + target_wav = [target_wav] + output = self.vc_model.voice_conversion(source_wav, target_wav, **kwargs) + if self.vocoder_model is not None: + output = self.vocoder_model.inference(output) + + output = output.squeeze() + process_time = time.time() - start_time + audio_time = len(output) / self.output_sample_rate + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) + return output def tts( self, @@ -288,7 +302,7 @@ def tts( reference_speaker_name=None, split_sentences: bool = True, **kwargs, - ) -> List[int]: + ) -> list[int]: """🐸 TTS magic. Run all the models and generate speech. 
Args: diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index d600bfb1f4..37f8048b7f 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List, Optional from coqpit import Coqpit @@ -47,7 +46,7 @@ class FreeVCAudioConfig(Coqpit): win_length: int = field(default=1280) n_mel_channels: int = field(default=80) mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) + mel_fmax: float | None = field(default=None) @dataclass @@ -122,11 +121,11 @@ class FreeVCArgs(Coqpit): kernel_size: int = field(default=3) p_dropout: float = field(default=0.1) resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [10, 8, 2, 2]) upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) n_layers_q: int = field(default=3) use_spectral_norm: bool = field(default=False) gin_channels: int = field(default=256) @@ -269,7 +268,7 @@ class FreeVCConfig(BaseVCConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py new file mode 100644 index 0000000000..7728ea0a9b --- /dev/null +++ b/TTS/vc/configs/knnvc_config.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class KNNVCAudioConfig(BaseAudioConfig): + """Audio configuration. + + Args: + sample_rate (int): + The sampling rate of the input waveform. + """ + + sample_rate: int = field(default=16000) + + +@dataclass +class KNNVCArgs(Coqpit): + """Model arguments. + + Args: + ssl_dim (int): + The dimension of the self-supervised learning embedding. + """ + + ssl_dim: int = field(default=1024) + + +@dataclass +class KNNVCConfig(BaseVCConfig): + """Parameters. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (KNNVCArgs): + Model architecture arguments. Defaults to `KNNVCArgs()`. + + audio (KNNVCAudioConfig): + Audio processing configuration. Defaults to `KNNVCAudioConfig()`. + + wavlm_layer (int): + WavLM layer to use for feature extraction. 
+ + topk (int): + k in the kNN -- the number of nearest neighbors to average over + """ + + model: str = "knnvc" + model_args: KNNVCArgs = field(default_factory=KNNVCArgs) + audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig) + + wavlm_layer: int = 6 + topk: int = 4 diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py index 261cdd6f47..167a61ddb3 100644 --- a/TTS/vc/configs/openvoice_config.py +++ b/TTS/vc/configs/openvoice_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from coqpit import Coqpit @@ -187,13 +186,13 @@ class OpenVoiceConfig(BaseVCConfig): # multi-speaker settings # use speaker embedding layer num_speakers: int = 0 - speakers_file: Optional[str] = None + speakers_file: str | None = None speaker_embedding_channels: int = 256 # use d-vectors use_d_vector_file: bool = False - d_vector_file: Optional[list[str]] = None - d_vector_dim: Optional[int] = None + d_vector_file: list[str] | None = None + d_vector_dim: int | None = None def __post_init__(self) -> None: for key, val in self.model_args.items(): diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index b2fe63d29d..b84a97e487 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,12 +1,11 @@ from dataclasses import dataclass, field -from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @dataclass class BaseVCConfig(BaseTrainingConfig): - """Shared parameters among all the tts models. + """Shared parameters among all the VC models. Args: @@ -132,7 +131,7 @@ class BaseVCConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -140,7 +139,7 @@ class BaseVCConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/vc/layers/freevc/modules.py b/TTS/vc/layers/freevc/modules.py index c34f22d701..92df39b5e0 100644 --- a/TTS/vc/layers/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -48,7 +48,7 @@ def forward(self, x, x_mask): class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -122,7 +122,7 @@ def remove_weight_norm(self): class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() + super().__init__() self.convs1 = nn.ModuleList( [ weight_norm( @@ -198,7 +198,7 @@ def remove_weight_norm(self): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() + super().__init__() self.convs = nn.ModuleList( [ weight_norm( diff --git a/TTS/vc/layers/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py index 5fa317ce45..5d14bf2f19 100644 --- 
a/TTS/vc/layers/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Optional, Union # import webrtcvad import librosa @@ -16,7 +15,7 @@ int16_max = (2**15) - 1 -def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): +def preprocess_wav(fpath_or_wav: str | Path | np.ndarray, source_sr: int | None = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. diff --git a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index a6d5bcf942..d2f4ffe394 100644 --- a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,6 +1,5 @@ import logging from time import perf_counter as timer -from typing import List, Union import numpy as np import torch @@ -22,12 +21,8 @@ class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None): - """ - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). - If None, defaults to cuda if it is available on your machine, otherwise the model will - run on cpu. Outputs are always returned on the cpu, as numpy arrays. - """ + def __init__(self, weights_fpath): + """FreeVC speaker encoder.""" super().__init__() # Define the network @@ -35,13 +30,6 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() - # Get the target device - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - elif isinstance(device, str): - device = torch.device(device) - self.device = device - # Load the pretrained model'speaker weights # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") # if not weights_fpath.exists(): @@ -52,8 +40,11 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): checkpoint = load_fsspec(weights_fpath, map_location="cpu") self.load_state_dict(checkpoint["model_state"], strict=False) - self.to(device) - logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) + logger.info("Loaded the voice encoder model in %.2f seconds.", timer() - start) + + @property + def device(self): + return next(self.parameters()).device def forward(self, mels: torch.FloatTensor): """ @@ -97,7 +88,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): assert 0 < min_coverage <= 1 # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + samples_per_frame = int(sampling_rate * mel_window_step / 1000) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) assert 0 < frame_step, "The rate is too high" @@ -123,7 +114,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): return wav_slices, mel_slices - def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75) -> torch.Tensor: """ Computes an embedding for a single utterance. 
The utterance is divided in partial utterances and an embedding is computed for each. The complete utterance embedding is the @@ -143,8 +134,8 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ then the last partial utterance will be considered by zero-padding the audio. Otherwise, it will be discarded. If there aren't enough frames for one partial utterance, this parameter is ignored so that the function always returns at least one slice. - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - <return_partials> is True, the partial utterances as a numpy array of float32 of shape + :return: the embedding as a float tensor of shape (model_embedding_size,). If + <return_partials> is True, the partial utterances as a float tensor of shape (n_partials, model_embedding_size) and the wav partials as a list of slices will also be returned. """ @@ -160,24 +151,26 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ mels = np.array([mel[s] for s in mel_slices]) with torch.no_grad(): mels = torch.from_numpy(mels).to(self.device) - partial_embeds = self(mels).cpu().numpy() + partial_embeds = self(mels) # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = partial_embeds.mean(dim=0) + embed = raw_embed / torch.norm(raw_embed, p=2) if return_partials: return embed, partial_embeds, wav_slices return embed - def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + def embed_speaker(self, wavs: list[np.ndarray], **kwargs): """ Compute the embedding of a collection of wavs (presumably from the same speaker) by averaging their embedding and L2-normalizing it. :param wavs: list of wavs a numpy arrays of float32. :param kwargs: extra arguments to embed_utterance() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + :return: the embedding as a float tensor of shape (model_embedding_size,).
""" - raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0) - return raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = torch.mean( + torch.stack([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]), dim=0 + ) + return raw_embed / torch.norm(raw_embed, p=2) diff --git a/TTS/vc/layers/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py index 62f7e74aaf..d9c3858f89 100644 --- a/TTS/vc/layers/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -13,7 +13,7 @@ model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" -def get_wavlm(device="cpu"): +def get_wavlm(device="cpu") -> WavLM: """Download the model and return the model object.""" output_path = get_user_data_dir("tts") diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py index 37c1a6e877..cf31a866de 100644 --- a/TTS/vc/layers/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -9,7 +9,6 @@ import math import warnings -from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -89,7 +88,7 @@ class Swish(nn.Module): def __init__(self): """Construct an MultiHeadedAttention object.""" - super(Swish, self).__init__() + super().__init__() self.act = torch.nn.Sigmoid() def forward(self, x): @@ -98,7 +97,7 @@ def forward(self, x): class GLU_Linear(nn.Module): def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): - super(GLU_Linear, self).__init__() + super().__init__() self.glu_type = glu_type self.output_dim = output_dim @@ -158,7 +157,7 @@ def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError(f"--activation-fn {activation} not supported") def init_bert_params(module): @@ -219,7 +218,7 @@ def quant_noise(module, p, block_size): return module # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + assert isinstance(module, nn.Linear | nn.Embedding | nn.Conv2d) # test whether module.weight has the right sizes wrt block_size is_conv = module.weight.ndim == 4 @@ -331,7 +330,7 @@ def __init__( self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" + "Self-attention requires query, key and value to be of the same size" ) k_bias = True @@ -424,17 +423,17 @@ def compute_bias(self, query_length, key_length): def forward( self, query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + key: Tensor | None, + value: Tensor | None, + key_padding_mask: Tensor | None = None, + incremental_state: dict[str, dict[str, Tensor | None]] | None = None, need_weights: bool = True, static_kv: bool = False, - attn_mask: Optional[Tensor] = None, + attn_mask: Tensor | None = None, before_softmax: bool = False, need_head_weights: bool = False, - position_bias: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + position_bias: Tensor | None = None, + ) -> tuple[Tensor, Tensor | None, Tensor | None]: """Input shape: Time x Batch x Channel Args: @@ -605,7 +604,7 @@ def forward( else: assert v is not None v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: 
Optional[Tensor] = None + prev_key_padding_mask: Tensor | None = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None @@ -700,7 +699,7 @@ def forward( assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None + attn_weights: Tensor | None = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: @@ -711,12 +710,12 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], + key_padding_mask: Tensor | None, + prev_key_padding_mask: Tensor | None, batch_size: int, src_len: int, static_kv: bool, - ) -> Optional[Tensor]: + ) -> Tensor | None: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask @@ -748,19 +747,19 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: + self, incremental_state: dict[str, dict[str, Tensor | None]] | None + ) -> dict[str, Tensor | None]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: - empty_result: Dict[str, Optional[Tensor]] = {} + empty_result: dict[str, Tensor | None] = {} return empty_result def _set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + incremental_state: dict[str, dict[str, Tensor | None]], + buffer: dict[str, Tensor | None], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index 775f3e5979..6358662e18 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -33,8 +33,8 @@ def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[torch.Tensor], + shape: tuple[int, int], + padding_mask: torch.Tensor | None, mask_prob: float, mask_length: int, mask_type: str = "static", @@ -68,8 +68,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() + mask_prob * all_sz / float(mask_length) + np.random.rand() ) all_num_mask = max(min_masks, all_num_mask) @@ -80,8 +79,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() + mask_prob * sz / float(mask_length) + np.random.rand() ) num_mask = max(min_masks, num_mask) else: @@ -155,9 +153,7 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = ( - "default" # mode for feature extractor. 
default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - ) + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -166,9 +162,7 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = ( - "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - ) + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -225,7 +219,7 @@ def __init__( cfg: WavLMConfig, ) -> None: super().__init__() - logger.info(f"WavLM Config: {cfg.__dict__}") + logger.info("WavLM Config: %s", cfg.__dict__) self.cfg = cfg feature_enc_layers = eval(cfg.conv_feature_layers) @@ -317,12 +311,12 @@ def forward_padding_mask( def extract_features( self, source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + padding_mask: torch.Tensor | None = None, mask: bool = False, ret_conv: bool = False, - output_layer: Optional[int] = None, + output_layer: int | None = None, ret_layer_results: bool = False, - ): + ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: @@ -367,7 +361,7 @@ def extract_features( class ConvFeatureExtractionModel(nn.Module): def __init__( self, - conv_layers: List[Tuple[int, int, int]], + conv_layers: list[tuple[int, int, int]], dropout: float = 0.0, mode: str = "default", conv_bias: bool = False, diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a9807d7006..859eaeb2a7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,15 +1,21 @@ import importlib import logging import re -from typing import Dict, List, Union + +from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.vc.models.base_vc import BaseVC logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": +def setup_model(config: BaseVCConfig) -> BaseVC: logger.info("Using model: %s", config.model) # fetch the right model implementation. - if "model" in config and config["model"].lower() == "freevc": + if config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC - model = MyModel.init_from_config(config, samples) - return model + elif config["model"].lower() == "knnvc": + MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC + else: + msg = f"Model {config.model} does not exist!" 
+ raise ValueError(msg) + return MyModel.init_from_config(config) diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 22ffd0095c..a953b901e8 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import torch import torch.distributed as dist @@ -37,9 +37,9 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: AudioProcessor, - speaker_manager: Optional[SpeakerManager] = None, - language_manager: Optional[LanguageManager] = None, + ap: AudioProcessor | None = None, + speaker_manager: SpeakerManager | None = None, + language_manager: LanguageManager | None = None, ) -> None: super().__init__() self.config = config @@ -51,7 +51,7 @@ def __init__( def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). - `ModelArgs` has all the fields reuqired to initialize the model architecture. + `ModelArgs` has all the fields required to initialize the model architecture. `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`. @@ -69,7 +69,7 @@ def _set_model_args(self, config: Coqpit) -> None: else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining `in_channels` size of the connected layers. @@ -106,7 +106,7 @@ def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: + def get_aux_input_from_test_sentences(self, sentence_info: str | list[str]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -275,10 +275,10 @@ def get_data_loader( config: Coqpit, assets: dict, is_eval: bool, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: Optional[int] = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -402,13 +402,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer: Trainer) -> None: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index c654219c39..59af40a836 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List, Optional, Tuple, Union import librosa import numpy as np @@ -102,7 +101,7 @@ def __init__( upsample_kernel_sizes, gin_channels=0, ): - super(Generator, self).__init__() + super().__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) @@ -165,7 +164,7 @@ def remove_weight_norm(self): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() + super().__init__() periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -190,7 +189,7 @@ def forward(self, y, y_hat): class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): - super(SpeakerEncoder, self).__init__() + super().__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() @@ -233,7 +232,7 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): class FreeVC(BaseVC): """ - Papaer:: + Paper:: https://arxiv.org/abs/2210.15418# Paper Abstract:: @@ -306,15 +305,11 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): self.wavlm = get_wavlm() - @property - def device(self): - return next(self.parameters()).device - def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" ) def init_multispeaker(self, config: Coqpit): @@ -335,15 +330,15 @@ def forward( self, c: torch.Tensor, spec: torch.Tensor, - g: Optional[torch.Tensor] = None, - mel: Optional[torch.Tensor] = None, - c_lengths: 
Optional[torch.Tensor] = None, - spec_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[ + g: torch.Tensor | None = None, + mel: torch.Tensor | None = None, + c_lengths: torch.Tensor | None = None, + spec_lengths: torch.Tensor | None = None, + ) -> tuple[ torch.Tensor, torch.Tensor, torch.Tensor, - Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ]: """ Forward pass of the model. @@ -389,8 +384,8 @@ def forward( return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() - def inference(self, c, g=None, mel=None, c_lengths=None): + @torch.inference_mode() + def inference(self, c, g=None, c_lengths=None): """ Inference pass of the model @@ -405,9 +400,6 @@ def inference(self, c, g=None, mel=None, c_lengths=None): """ if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) - if not self.use_spk: - g = self.enc_spk.embed_utterance(mel) - g = g.unsqueeze(-1) z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g) @@ -438,51 +430,52 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src, tgt): + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]): """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterances. Returns: torch.Tensor: Output tensor. """ - wav_tgt = self.load_audio(tgt).cpu().numpy() - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - if self.config.model_args.use_spk: - g_tgt = self.enc_spk_ex.embed_utterance(wav_tgt) - g_tgt = torch.from_numpy(g_tgt)[None, :, None].to(self.device) - else: - wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) - mel_tgt = mel_spectrogram_torch( - wav_tgt, - self.config.audio.filter_length, - self.config.audio.n_mel_channels, - self.config.audio.input_sample_rate, - self.config.audio.hop_length, - self.config.audio.win_length, - self.config.audio.mel_fmin, - self.config.audio.mel_fmax, - ) # src wav_src = self.load_audio(src) c = self.extract_wavlm_features(wav_src[None, :]) - if self.config.model_args.use_spk: - audio = self.inference(c, g=g_tgt) - else: - audio = self.inference(c, mel=mel_tgt.transpose(1, 2)) - audio = audio[0][0].data.cpu().float().numpy() - return audio + # tgt + g_tgts = [] + for tg in tgt: + wav_tgt = self.load_audio(tg).cpu().numpy() + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + + if self.config.model_args.use_spk: + g_tgts.append(self.enc_spk_ex.embed_utterance(wav_tgt)[None, :, None]) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + self.config.audio.filter_length, + self.config.audio.n_mel_channels, + self.config.audio.input_sample_rate, + self.config.audio.hop_length, + self.config.audio.win_length, + self.config.audio.mel_fmin, + self.config.audio.mel_fmax, + ) + g_tgts.append(self.enc_spk.embed_utterance(mel_tgt.transpose(1, 2)).unsqueeze(-1)) + + g_tgt = torch.stack(g_tgts).mean(dim=0) + audio = self.inference(c, g=g_tgt) + return audio[0][0].data.cpu().float().numpy() def eval_step(): ... 
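The multi-reference handling added to `FreeVC.voice_conversion` above reduces any number of target utterances to a single conditioning vector: one speaker embedding is computed per reference clip and the embeddings are mean-pooled before `inference()` is called. A minimal, self-contained sketch of that pooling step (shapes are illustrative; FreeVC's conditioning tensors are `(1, gin_channels, 1)` with `gin_channels=256` by default):

```python
import torch

# One conditioning embedding per reference clip, e.g. from enc_spk / enc_spk_ex (values are stand-ins).
per_clip = [torch.randn(1, 256, 1) for _ in range(3)]
g_tgt = torch.stack(per_clip).mean(dim=0)  # still (1, 256, 1) after averaging over clips
assert g_tgt.shape == (1, 256, 1)
```

Because the averaging happens in embedding space, the decoder still receives a single `g` tensor, so `inference()` and the flow/decoder stack are unchanged for multi-reference conversion.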
@staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: FreeVCConfig) -> "FreeVC": model = FreeVC(config) return model diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py new file mode 100644 index 0000000000..c31f52e749 --- /dev/null +++ b/TTS/vc/models/knnvc.py @@ -0,0 +1,181 @@ +import logging +import os +from typing import Any, TypeAlias + +import torch +import torch.nn.functional as F +import torchaudio +from coqpit import Coqpit + +from TTS.vc.configs.knnvc_config import KNNVCConfig +from TTS.vc.layers.freevc.wavlm import get_wavlm +from TTS.vc.models.base_vc import BaseVC + +logger = logging.getLogger(__name__) + +PathOrTensor: TypeAlias = str | os.PathLike[Any] | torch.Tensor + + +class KNNVC(BaseVC): + """ + Paper:: + https://arxiv.org/abs/2305.18975 + + Paper Abstract:: + Any-to-any voice conversion aims to transform source speech + into a target voice with just a few examples of the target speaker as a + reference. Recent methods produce convincing conversions, but at the cost of + increased complexity -- making results difficult to reproduce and build on. + Instead, we keep it simple. We propose k-nearest neighbors voice conversion + (kNN-VC): a straightforward yet effective method for any-to-any conversion. + First, we extract self-supervised representations of the source and reference + speech. To convert to the target speaker, we replace each frame of the source + representation with its nearest neighbor in the reference. Finally, a pretrained + vocoder synthesizes audio from the converted representation. Objective and + subjective evaluations show that kNN-VC improves speaker similarity with similar + intelligibility scores to existing methods. + + Samples:: + https://bshall.github.io/knn-vc + + Original code:: + https://github.com/bshall/knn-vc + + Examples: + >>> from TTS.vc.configs.knnvc_config import KNNVCConfig + >>> from TTS.vc.models.knnvc import KNNVC + >>> config = KNNVCConfig() + >>> model = KNNVC(config) + """ + + def __init__(self, config: Coqpit): + super().__init__(config) + self.ssl_dim = self.args.ssl_dim + self.wavlm = get_wavlm() + + @staticmethod + def init_from_config(config: KNNVCConfig) -> "KNNVC": + return KNNVC(config) + + @torch.inference_mode() + def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor: + """Return features for the given waveform with output shape (seq_len, dim). + + Optionally perform VAD trimming on start/end with `vad_trigger_level`. 
+ """ + # load audio + if isinstance(audio, torch.Tensor): + x: torch.Tensor = audio + sr = self.config.audio.sample_rate + if x.dim() == 1: + x = x[None] + else: + x, sr = torchaudio.load(audio, normalize=True) + + if not sr == self.config.audio.sample_rate: + logger.info("Resampling %d to %d in %s", sr, self.config.audio.sample_rate, audio) + x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) + sr = self.config.audio.sample_rate + + # trim silence from front and back + if vad_trigger_level > 1e-3: + transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level) + x_front_trim = transform(x) + waveform_reversed = torch.flip(x_front_trim, (-1,)) + waveform_reversed_front_trim = transform(waveform_reversed) + x = torch.flip(waveform_reversed_front_trim, (-1,)) + + # extract the representation of each layer + wav_input_16khz = x.to(self.device) + features = self.wavlm.extract_features( + wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False + )[0] + return features.squeeze(0) + + def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor: + """Get concatenated wavlm features for the matching set using all waveforms in `wavs`. + + Wavs are specified as either a list of paths or list of loaded waveform tensors of + shape (channels, T), assumed to be of 16kHz sample rate. + """ + feats = [] + for p in wavs: + feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level)) + + feats = torch.concat(feats, dim=0).cpu() + return feats + + @staticmethod + def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor: + """Like torch.cdist, but fixed dim=-1 and for cosine distance.""" + source_norms = torch.norm(source_feats, p=2, dim=-1) + matching_norms = torch.norm(matching_pool, p=2, dim=-1) + dotprod = ( + -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2) + + source_norms[:, None] ** 2 + + matching_norms[None] ** 2 + ) + dotprod /= 2 + + dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None])) + return dists + + @torch.inference_mode() + def match( + self, + query_seq: torch.Tensor, + matching_set: torch.Tensor, + synth_set: torch.Tensor | None = None, + topk: int | None = None, + target_duration: float | None = None, + ) -> torch.Tensor: + """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching + with k=`topk`. + + Args: + `query_seq`: Tensor (N1, dim) of the input/source query features. + `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm. + `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. We use the matching set to assign + each query vector to a vector in the matching set, and then use the corresponding vector from + the synth set during HiFiGAN synthesis. + By default, and for best performance, this should be identical to the matching set. + `topk`: k in the kNN -- the number of nearest neighbors to average over. + `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds. 
+ + Returns: + - converted features (1, N, dim) + """ + if topk is None: + topk = self.config.topk + synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device) + matching_set = matching_set.to(self.device) + query_seq = query_seq.to(self.device) + + if target_duration is not None: + target_samples = int(target_duration * self.config.audio.sample_rate) + scale_factor = (target_samples / self.hop_length) / query_seq.shape[0] # n_targ_feats / n_input_feats + query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T + + dists = self.fast_cosine_dist(query_seq, matching_set) + best = dists.topk(k=topk, largest=False, dim=-1) + out_feats = synth_set[best.indices].mean(dim=1) + return out_feats.unsqueeze(0) + + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: str | os.PathLike[Any]) -> None: + """kNN-VC does not use checkpoints.""" + + def forward(self) -> None: ... + def inference(self) -> None: ... + + @torch.inference_mode() + def voice_conversion( + self, + source: PathOrTensor, + target: list[PathOrTensor], + topk: int | None = None, + ) -> torch.Tensor: + if not isinstance(target, list): + target = [target] + source_features = self.get_features(source) + matching_set = self.get_matching_set(target) + return self.match(source_features, matching_set, topk=topk) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py index 135b0861b9..1049a580c7 100644 --- a/TTS/vc/models/openvoice.py +++ b/TTS/vc/models/openvoice.py @@ -1,8 +1,9 @@ import json import logging import os +from collections.abc import Mapping from pathlib import Path -from typing import Any, Mapping, Optional, Union +from typing import Any import librosa import numpy as np @@ -117,7 +118,7 @@ class OpenVoice(BaseVC): October 2023, serving as the backend of MyShell. """ - def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = None) -> None: + def __init__(self, config: Coqpit, speaker_manager: SpeakerManager | None = None) -> None: super().__init__(config, None, speaker_manager, None) self.init_multispeaker(config) @@ -174,15 +175,11 @@ def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = N self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) - @property - def device(self) -> torch.device: - return next(self.parameters()).device - @staticmethod def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": return OpenVoice(config) - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -199,7 +196,7 @@ def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> def load_checkpoint( self, config: OpenVoiceConfig, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -223,16 +220,16 @@ def train_step(self) -> None: ... def eval_step(self) -> None: ... 
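With no trainable conversion module, kNN-VC comes down to the `match()` step defined in `knnvc.py` above: frame-wise k-nearest-neighbour regression over WavLM features. A simplified, self-contained sketch with random stand-in features (the 1024 dimension follows `KNNVCArgs.ssl_dim`, `k=4` follows `KNNVCConfig.topk`; the normalised-dot-product distance is a shortcut equivalent to `fast_cosine_dist`):

```python
import torch
import torch.nn.functional as F

query = torch.randn(100, 1024)     # (N1, dim) source frames, stand-ins for get_features()
matching = torch.randn(500, 1024)  # (N2, dim) reference frames, stand-ins for get_matching_set()

# Cosine distance between every source frame and every reference frame.
q = F.normalize(query, dim=-1)
m = F.normalize(matching, dim=-1)
dists = 1.0 - q @ m.T              # (N1, N2)

# Replace each source frame with the mean of its k nearest reference frames.
best = dists.topk(k=4, largest=False, dim=-1)
converted = matching[best.indices].mean(dim=1)  # (N1, dim), later vocoded into a waveform
print(converted.shape)  # torch.Size([100, 1024])
```

The converted features are what `KNNVC.voice_conversion()` returns; turning them back into audio is left to a vocoder that accepts SSL features, which is what the `pre_linear` addition to the HiFiGAN generator further below enables.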
@staticmethod - def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tensor]]) -> torch.Tensor: + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, torch.Tensor | None]) -> torch.Tensor: if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: return aux_input["x_lengths"] - return torch.tensor(x.shape[1:2]).to(x.device) + return torch.tensor(x.shape[-1:]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x: torch.Tensor, - aux_input: Mapping[str, Optional[torch.Tensor]] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + aux_input: Mapping[str, torch.Tensor | None] = {"x_lengths": None, "g_src": None, "g_tgt": None}, ) -> dict[str, torch.Tensor]: """ Inference pass of the model @@ -271,7 +268,7 @@ def inference( "z_hat": z_hat, } - def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list[float]]) -> torch.Tensor: + def load_audio(self, wav: str | npt.NDArray[np.float32] | torch.Tensor | list[float]) -> torch.Tensor: """Read and format the input audio.""" if isinstance(wav, str): out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) @@ -283,9 +280,8 @@ def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list out = wav return out.to(self.device).float() - def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: - audio_ref = self.load_audio(audio) - y = torch.FloatTensor(audio_ref) + def extract_se(self, audio: str | torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + y = self.load_audio(audio) y = y.to(self.device) y = y.unsqueeze(0) spec = wav_to_spec( @@ -301,19 +297,23 @@ def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, tor return g, spec @torch.inference_mode() - def voice_conversion(self, src: Union[str, torch.Tensor], tgt: Union[str, torch.Tensor]) -> npt.NDArray[np.float32]: + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]) -> npt.NDArray[np.float32]: """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterance. Returns: Output numpy array. """ src_se, src_spec = self.extract_se(src) - tgt_se, _ = self.extract_se(tgt) + tgt_ses = [] + for tg in tgt: + tgt_se, _ = self.extract_se(tg) + tgt_ses.append(tgt_se) + tgt_se = torch.stack(tgt_ses).mean(dim=0) aux_input = {"g_src": src_se, "g_tgt": tgt_se} audio = self.inference(src_spec, aux_input) diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index 9a102f0c89..60dde496b2 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -5,7 +5,7 @@ @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for FullBand MelGAN vocoder. + """Defines parameters for HifiGAN vocoder. 
Example: diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py index 67f324cfce..85662831ee 100644 --- a/TTS/vocoder/configs/univnet_config.py +++ b/TTS/vocoder/configs/univnet_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @@ -96,7 +95,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # model specific params discriminator_model: str = "univnet_discriminator" generator_model: str = "univnet_generator" - generator_model_params: Dict = field( + generator_model_params: dict = field( default_factory=lambda: { "in_channels": 64, "out_channels": 1, @@ -121,7 +120,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # loss weights - overrides stft_loss_weight: float = 2.5 - stft_loss_params: Dict = field( + stft_loss_params: dict = field( default_factory=lambda: { "n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], @@ -133,7 +132,7 @@ class UnivnetConfig(BaseGANVocoderConfig): hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 0 l1_spec_loss_weight: float = 0 - l1_spec_loss_params: Dict = field( + l1_spec_loss_params: dict = field( default_factory=lambda: { "use_mel": True, "sample_rate": 22050, @@ -153,7 +152,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) steps_to_start_discriminator: int = 200000 def __post_init__(self): diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 04462817a8..cef6a50b05 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -1,5 +1,3 @@ -from typing import List - from coqpit import Coqpit from torch.utils.data import Dataset @@ -10,7 +8,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: list) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 0806c0d496..076545f8a2 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -32,7 +32,7 @@ def __init__( super().__init__() self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short @@ -128,9 +128,9 @@ def load_item(self, idx): # correct the audio length wrt padding applied in stft audio = np.pad(audio, (0, self.hop_len), mode="edge") audio = audio[: mel.shape[-1] * self.hop_len] - assert ( - mel.shape[-1] * self.hop_len == audio.shape[-1] - ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + assert mel.shape[-1] * self.hop_len == audio.shape[-1], ( + f" [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + ) audio = torch.from_numpy(audio).float().unsqueeze(0) mel = torch.from_numpy(mel).float().squeeze(0) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 6f34bccb7c..435330bebe 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -2,7 +2,6 @@ import os import random from multiprocessing import Manager -from typing import List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def __getitem__(self, idx): item = self.load_item(idx) return item - def load_test_samples(self, num_samples: int) -> List[Tuple]: + def load_test_samples(self, num_samples: int) -> list[tuple]: """Return test samples. Args: @@ -103,9 +102,9 @@ def load_item(self, idx): audio = np.pad( audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0 ) - assert ( - audio.shape[-1] >= self.seq_len + self.pad_short - ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + assert audio.shape[-1] >= self.seq_len + self.pad_short, ( + f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + ) # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 4c4f5c48df..ffb71177c5 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -18,7 +18,7 @@ class WaveRNNDataset(Dataset): def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 8d4dd725ef..81a1f30884 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,5 +1,3 @@ -from typing import Dict, Union - import torch from torch import nn from torch.nn import functional as F @@ -226,9 +224,9 @@ class GeneratorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False @@ -313,9 +311,9 @@ class DiscriminatorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss @@ -352,7 +350,7 @@ def forward(self, scores_fake, scores_real): class WaveRNNLoss(nn.Module): - def __init__(self, wave_rnn_mode: Union[str, int]): + def __init__(self, wave_rnn_mode: str | int): super().__init__() if wave_rnn_mode == "mold": self.loss_func = discretized_mix_logistic_loss @@ -363,6 +361,6 @@ def __init__(self, wave_rnn_mode: Union[str, int]): else: raise ValueError(" [!] 
Unknown mode for Wavernn.") - def forward(self, y_hat, y) -> Dict: + def forward(self, y_hat, y) -> dict: loss = self.loss_func(y_hat, y) return {"loss": loss} diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 9f1512c6d4..187e7062e2 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -74,7 +74,7 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert isinstance(dilation, (list, tuple)) + assert isinstance(dilation, list | tuple) assert len(dilation) == 4 self.factor = factor diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index b6a1850484..481d234a54 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -5,11 +5,13 @@ from coqpit import Coqpit from TTS.utils.generic_utils import to_camel +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig, BaseVocoderConfig +from TTS.vocoder.models.base_vocoder import BaseVocoder logger = logging.getLogger(__name__) -def setup_model(config: Coqpit): +def setup_model(config: BaseVocoderConfig) -> BaseVocoder: """Load models directly from configuration.""" if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") @@ -26,19 +28,20 @@ def setup_model(config: Coqpit): try: MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: - raise ValueError(f"Model {config.model} not exist!") from e + raise ValueError(f"Model {config.model} does not exist!") from e logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) -def setup_generator(c): +def setup_generator(c: BaseGANVocoderConfig): """TODO: use config object as arguments""" logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) if c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"]) + model = MyModel(out_channels=1, **c.generator_model_params) elif c.generator_model.lower() in "melgan_generator": model = MyModel( in_channels=c.audio["num_mels"], @@ -94,8 +97,8 @@ def setup_generator(c): return model -def setup_discriminator(c): - """TODO: use config objekt as arguments""" +def setup_discriminator(c: BaseGANVocoderConfig): + """TODO: use config object as arguments""" logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -104,7 +107,7 @@ def setup_discriminator(c): MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) if c.discriminator_model in "hifigan_discriminator": model = MyModel() - if c.discriminator_model in "random_window_discriminator": + elif c.discriminator_model in "random_window_discriminator": model = MyModel( cond_channels=c.audio["num_mels"], hop_length=c.audio["hop_length"], @@ -113,7 +116,7 @@ def setup_discriminator(c): cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], window_sizes=c.discriminator_model_params["window_sizes"], ) - if c.discriminator_model in "melgan_multiscale_discriminator": + elif c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -122,7 +125,7 @@ def setup_discriminator(c): max_channels=c.discriminator_model_params["max_channels"], downsample_factors=c.discriminator_model_params["downsample_factors"], ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + elif c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -137,7 +140,7 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + elif c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -149,6 +152,8 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) - if c.discriminator_model == "univnet_discriminator": + elif c.discriminator_model == "univnet_discriminator": model = MyModel() + else: + raise NotImplementedError(f"Model {c.discriminator_model} not implemented!") return model diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..ba3852e795 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -1,5 +1,4 @@ from inspect import signature -from typing import Dict, List, Tuple 
import numpy as np import torch @@ -65,7 +64,7 @@ def inference(self, x: torch.Tensor) -> torch.Tensor: """ return self.model_g.inference(x) - def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict, optimizer_idx: int) -> tuple[dict, dict]: """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for network on the current pass. @@ -185,7 +184,7 @@ def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[ outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tuple[dict, dict]: """Logging shared by the training and evaluation. Args: @@ -205,22 +204,32 @@ def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tup return figures, audios def train_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) @@ -259,7 +268,7 @@ def on_train_step_start(self, trainer) -> None: """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. @@ -275,7 +284,7 @@ def get_optimizer(self) -> List: ) return [optimizer2, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -283,7 +292,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -297,7 +306,7 @@ def get_scheduler(self, optimizer) -> List: return [scheduler2, scheduler1] @staticmethod - def format_batch(batch: List) -> Dict: + def format_batch(batch: list) -> dict: """Format the batch for training. 
Args: @@ -316,9 +325,9 @@ def format_batch(batch: List) -> Dict: def get_data_loader( # pylint: disable=no-self-use, unused-argument self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, rank: int = None, # pylint: disable=unused-argument diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8273d02037..308b12ab56 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -179,6 +179,7 @@ def __init__( conv_post_weight_norm=True, conv_post_bias=True, cond_in_each_up_layer=False, + pre_linear=None, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -198,6 +199,7 @@ def __init__( for each consecutive upsampling layer. upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. + pre_linear (int): If not None, add nn.Linear(pre_linear, in_channels) before the convolutions. """ super().__init__() self.inference_padding = inference_padding @@ -206,6 +208,8 @@ def __init__( self.cond_in_each_up_layer = cond_in_each_up_layer # initial upsampling layers + if pre_linear is not None: + self.lin_pre = nn.Linear(pre_linear, in_channels) self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers @@ -258,6 +262,9 @@ def forward(self, x, g=None): x: [B, C, T] Tensor: [B, 1, T] """ + if hasattr(self, "lin_pre"): + x = self.lin_pre(x) + x = x.permute(0, 2, 1) o = self.conv_pre(x) if hasattr(self, "cond_layer"): o = o + self.cond_layer(g) @@ -280,7 +287,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: @@ -306,9 +313,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - 
@torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 211d45d91c..02ad60e0ff 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -71,7 +71,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -174,7 +174,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index e60c8781f0..71b38d4c0d 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -108,9 +108,9 @@ def forward(self, c): # perform upsampling if c is not None and self.upsample_net is not None: c = self.upsample_net(c) - assert ( - c.shape[-1] == x.shape[-1] - ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + assert c.shape[-1] == x.shape[-1], ( + f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + ) # encode to hidden representation x = self.first_conv(x) @@ -127,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") @@ -145,7 +145,7 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) @@ -155,9 +155,7 @@ def _apply_weight_norm(m): def receptive_field_size(self): return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 5d1f817927..d991941441 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,5 +1,4 @@ import logging -from typing import List import numpy as np import torch @@ -21,7 +20,7 @@ def __init__( out_channels: int, hidden_channels: int, cond_channels: int, - upsample_factors: List[int], + upsample_factors: list[int], lvc_layers_each_block: int, lvc_kernel_size: int, kpnet_hidden_channels: int, @@ -128,7 +127,7 @@ def apply_weight_norm(self): """Apply weight normalization module from all of the layers.""" def _apply_weight_norm(m): - if 
isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) @@ -139,7 +138,7 @@ def receptive_field_size(self): """Return receptive field size.""" return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..b1a4a26562 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -25,10 +24,10 @@ class WavegradArgs(Coqpit): use_weight_norm: bool = False y_conv_channels: int = 32 x_conv_channels: int = 768 - dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) - ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) - upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) - upsample_dilations: List[List[int]] = field( + dblock_out_channels: list[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: list[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: list[list[int]] = field( default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] ) @@ -123,7 +122,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -218,9 +217,7 @@ def apply_weight_norm(self): self.out_conv = weight_norm(self.out_conv) self.y_conv = weight_norm(self.y_conv) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -242,7 +239,7 @@ def load_checkpoint( ) self.compute_noise_level(betas) - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: # format data x = batch["input"] y = batch["waveform"] @@ -258,20 +255,30 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: pass - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, 
steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: pass - def test(self, assets: Dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument + def test(self, assets: dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument # setup noise schedule and inference ap = assets["audio_processor"] noise_schedule = self.config["test_noise_schedule"] @@ -302,13 +309,13 @@ def get_criterion(): return torch.nn.L1Loss() @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: # return a whole audio segment m, y = batch[0], batch[1] y = y.unsqueeze(1) return {"input": m, "waveform": y} - def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: List, verbose: bool, num_gpus: int): + def get_data_loader(self, config: Coqpit, assets: dict, is_eval: True, samples: list, verbose: bool, num_gpus: int): ap = assets["audio_processor"] dataset = WaveGradDataset( ap=ap, diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 1847679890..5a93f125ba 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,7 +1,6 @@ import sys import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -171,7 +170,7 @@ class WavernnArgs(Coqpit): num_res_blocks: int = 10 use_aux_net: bool = True use_upsample_net: bool = True - upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 8, 8]) mode: str = "mold" # mold [string], gauss [string], bits [int] mulaw: bool = True # apply mulaw if mode is bits pad: int = 2 @@ -226,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: - assert ( - np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length - ), " [!] upsample scales needs to be equal to hop_length" + assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, ( + " [!] 
upsample scales needs to be equal to hop_length" + ) self.upsample = UpsampleNetwork( self.args.feat_dims, self.args.upsample_factors, @@ -307,7 +306,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) @@ -528,16 +527,14 @@ def xfade_and_unfold(y, target, overlap): return unfolded - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: mels = batch["input"] waveform = batch["waveform"] waveform_coarse = batch["waveform_coarse"] @@ -552,13 +549,16 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: loss_dict = criterion(y_hat, waveform_coarse) return {"model_output": y_hat}, loss_dict - def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return self.train_step(batch, criterion) @torch.no_grad() def test( - self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument - ) -> Tuple[Dict, Dict]: + self, + assets: dict, + test_loader: "DataLoader", + output: dict, # pylint: disable=unused-argument + ) -> tuple[dict, dict]: ap = self.ap figures = {} audios = {} @@ -579,14 +579,18 @@ def test( return figures, audios def test_log( - self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: waveform = batch[0] mels = batch[1] waveform_coarse = batch[2] @@ -595,9 +599,9 @@ def format_batch(batch: Dict) -> Dict: def get_data_loader( # pylint: disable=no-self-use self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, ): diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index fe706ba9ff..bef68e5564 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -12,7 +12,7 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp(-2.0 * log_std)) return log_probs.squeeze().mean() diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index ac797d97f7..2823d206a0 100644 --- 
a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,5 +1,4 @@ import logging -from typing import Dict import numpy as np import torch @@ -32,7 +31,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: +def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> dict: """Plot the predicted and the real waveform and their spectrograms. Args: diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 042f9f8e7a..ef98fe302e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -7,11 +7,11 @@ You can use premade images built automatically from the latest TTS version. ### CPU version ```bash -docker pull ghcr.io/coqui-ai/tts-cpu +docker pull ghcr.io/idiap/coqui-tts-cpu ``` ### GPU version ```bash -docker pull ghcr.io/coqui-ai/tts +docker pull ghcr.io/idiap/coqui-tts ``` ## Building your own image @@ -25,14 +25,14 @@ You can pass any tts argument after the image name. ### CPU version ```bash -docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav +docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav ``` ### GPU version For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -41,14 +41,14 @@ Start the container and get a shell inside it. ### CPU version ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version ```bash -docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts +docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/idiap/coqui-tts python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` diff --git a/docs/source/extension/implementing_a_new_model.md b/docs/source/extension/implementing_a_new_model.md index 2521789771..188f466c72 100644 --- a/docs/source/extension/implementing_a_new_model.md +++ b/docs/source/extension/implementing_a_new_model.md @@ -37,7 +37,7 @@ an infinite flexibility to add custom behaviours for your model and training routines. For more details, see [BaseTTS](../main_classes/model_api.md#base-tts-model) - and `TTS.utils.callbacks`. + and [`trainer.callbacks`](https://github.com/idiap/coqui-ai-Trainer/blob/main/trainer/callbacks.py). 6. Optionally, define `MyModelArgs`. 
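To make the callback hook referenced above concrete, here is a minimal sketch, under the assumption that hooks are plain methods on the model which the Trainer calls at fixed points of the training loop; the hook signature and attributes mirror `on_train_step_start` from the GAN vocoder change earlier in this diff, and `MyVocoder` is purely an illustrative placeholder.

```python
# Minimal sketch (assumptions: MyVocoder is an illustrative name; the hook
# signature mirrors GAN.on_train_step_start as changed elsewhere in this diff).
from TTS.vocoder.models.base_vocoder import BaseVocoder


class MyVocoder(BaseVocoder):
    def on_train_step_start(self, trainer) -> None:
        # The Trainer passes itself in, so per-step state can be derived from
        # trainer.total_steps_done, e.g. enabling the discriminator only after
        # the configured number of steps.
        self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator
```

Other hook points (epoch and step boundaries) are listed in the `trainer.callbacks` module linked above, so training-loop tweaks can also live outside the model class.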
diff --git a/docs/source/inference.md b/docs/source/inference.md index cb7d01fca3..1bb844aee3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -16,6 +16,7 @@ Coqui TTS provides three main methods for inference: ```{toctree} :hidden: +vc server marytts ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 91d4b4078c..5f6c6ba44c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -182,7 +182,7 @@ To use the model API, you need to download the model files and pass config and m If you want to be able to `load_checkpoint` with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first. ```console -pip install deepspeed==0.10.3 +pip install deepspeed ``` #### Inference parameters diff --git a/docs/source/server.md b/docs/source/server.md index 3fa211d0d7..69bdace27b 100644 --- a/docs/source/server.md +++ b/docs/source/server.md @@ -4,8 +4,7 @@ You can boot up a demo 🐸TTS server to run an inference with your models (make sure to install the additional dependencies with `pip install coqui-tts[server]`). -Note that the server is not optimized for performance and does not support all -Coqui models yet. +Note that the server is not optimized for performance. The demo server provides pretty much the same interface as the CLI command. @@ -15,7 +14,8 @@ tts-server --list_models # list the available models. ``` Run a TTS model, from the release models list, with its default vocoder. -If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize +If the model you choose is a multi-speaker or multilingual TTS model, you can +select different speakers and languages on the Web interface and synthesize speech. ```bash diff --git a/docs/source/vc.md b/docs/source/vc.md new file mode 100644 index 0000000000..8b45d9393a --- /dev/null +++ b/docs/source/vc.md @@ -0,0 +1,84 @@ +# Voice conversion + +## Overview + +Voice conversion (VC) converts the voice in a speech signal from one speaker to +that of another speaker while preserving the linguistic content. Coqui supports +both voice conversion on its own, as well as applying it after speech synthesis +to enable multi-speaker output with single-speaker TTS models. + +### Python API + +Converting the voice in `source_wav` to the voice of `target_wav` (the latter +can also be a list of files): + +```python +from TTS.api import TTS + +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) +``` + +Voice cloning by combining TTS and VC. The FreeVC model is used for voice +conversion after synthesizing speech. + +```python + +tts = TTS("tts_models/de/thorsten/tacotron2-DDC") +tts.tts_with_vc_to_file( + "Wie sage ich auf Italienisch, dass ich dich liebe?", + speaker_wav=["target1.wav", "target2.wav"], + file_path="output.wav" +) +``` + +Some models, including [XTTS](models/xtts.md), support voice cloning directly +and a separate voice conversion step is not necessary: + +```python +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) +``` + +### CLI + +```sh +tts --out_path output/path/speech.wav \ + --model_name "<language>/<dataset>/<model_name>" \ + --source_wav <path/to/source.wav> \ + --target_wav <path/to/target.wav> +``` + +## Pretrained models + +Coqui includes the following pretrained voice conversion models.
Training is not +supported. + +### FreeVC + +- `voice_conversion_models/multilingual/vctk/freevc24` + +Adapted from: https://github.com/OlaWod/FreeVC + +### kNN-VC + +- `voice_conversion_models/multilingual/multi-dataset/knnvc` + +At least 1-5 minutes of target speaker data are recommended. + +Adapted from: https://github.com/bshall/knn-vc + +### OpenVoice + +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +Adapted from: https://github.com/myshell-ai/OpenVoice diff --git a/hubconf.py b/hubconf.py index 6e10928265..b49c9d6bcc 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,14 @@ -dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] +dependencies = [ + "torch", + "gdown", + "pysbd", + "gruut", + "anyascii", + "pypinyin", + "coqpit-config", + "mecab-python3", + "unidic-lite", +] import torch from TTS.utils.manage import ModelManager @@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us if __name__ == "__main__": - synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") + synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 4855886efd..44bf25c071 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -43,7 +43,7 @@ def process_meta_data(path): meta_data = {} # load meta data - with open(path, "r", encoding="utf-8") as f: + with open(path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") for row in data: frames = int(row[2]) @@ -58,7 +58,7 @@ def process_meta_data(path): "utt": utt, "frames": frames, "audio_len": audio_len, - "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]), + "row": f"{row[0]}|{row[1]}|{row[2]}|{row[3]}", } ) @@ -156,7 +156,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): phonemes = {} - with open(train_path, "r", encoding="utf-8") as f: + with open(train_path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") phonemes["None"] = 0 for row in data: diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/plot_embeddings_umap.ipynb similarity index 56% rename from notebooks/PlotUmapLibriTTS.ipynb rename to notebooks/plot_embeddings_umap.ipynb index 1e29790b9e..b661f85673 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/plot_embeddings_umap.ipynb @@ -4,13 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Overview\n", + "# Overview\n", "\n", "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", "\n", "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -19,63 +26,47 @@ "source": [ "import os\n", "import glob\n", + "import random\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import torch\n", "import umap\n", "\n", - "from TTS.utils.audio import AudioProcessor\n", + "from TTS.bin.compute_embeddings import compute_embeddings\n", "from TTS.config import load_config\n", + "from TTS.config.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", + "from TTS.utils.audio import AudioProcessor\n", "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.transform import factor_cmap\n", - "from bokeh.palettes import Category10" + "from bokeh.palettes import Category10\n", + "\n", + "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "For larger sets of speakers, you can use `Category20`, but you need to change it in the `pal` variable too\n", "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "List of Bokeh palettes here: https://docs.bokeh.org/en/latest/docs/reference/palettes.html\n", "\n", "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "## Config\n", "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + "You should adjust all the paths to point at the relevant locations for you locally." 
] }, { @@ -84,7 +75,16 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -1 $MODEL_RUN_PATH" + "# Dataset\n", + "formatter_name = \"ljspeech\"\n", + "dataset_name = \"ljspeech\"\n", + "dataset_path = \"path/to/LJSpeech-1.1\"\n", + "meta_file_train = \"metadata.csv\"\n", + "\n", + "# Speaker encoder\n", + "se_model_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\"\n", + "se_config_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\"\n", + "embedding_path = \"speakers.pth\"" ] }, { @@ -93,15 +93,25 @@ "metadata": {}, "outputs": [], "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" + "dataset_config = BaseDatasetConfig()\n", + "dataset_config.formatter = formatter_name\n", + "dataset_config.dataset_name = dataset_name\n", + "dataset_config.path = dataset_path\n", + "dataset_config.meta_file_train = meta_file_train\n", + "\n", + "meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=False)\n", + "utt_to_wav = {\n", + " item[\"audio_unique_name\"]: str(Path(item[\"audio_file\"]).relative_to(dataset_path)) for item in meta_data_train\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" + "## Compute embeddings\n", + "\n", + "You can skip this if you have already computed embeddings with `TTS/bin/compute_embeddings.py`" ] }, { @@ -110,33 +120,38 @@ "metadata": {}, "outputs": [], "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" + "compute_embeddings(\n", + " model_path=se_model_path,\n", + " config_path=se_config_path,\n", + " output_path=embedding_path,\n", + " formatter_name=formatter_name,\n", + " dataset_name=dataset_name,\n", + " dataset_path=dataset_path,\n", + " meta_file_train=meta_file_train,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check that we did indeed find an embedding" + "## Plot Umap" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "embed_files[0]" + "Bring in the embeddings created by `TTS/bin/compute_embeddings.py`" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + "embeddings = torch.load(embedding_path, weights_only=True)" ] }, { @@ -145,15 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" + "speakers = set()\n", + "speaker_to_utter = defaultdict(list)\n", + "for idx, embedding in embeddings.items():\n", + " speaker = embedding[\"name\"]\n", + " speakers.add(speaker)\n", + " speaker_to_utter[speaker].append(idx)\n", + "print(f\"Speaker count: {len(speakers)}\")" ] }, 
{ @@ -175,35 +188,32 @@ "labels = []\n", "locations = []\n", "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", + "# single speaker\n", + "num_speakers = 1\n", + "num_utters = 1000\n", "\n", "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", + "# num_speakers = 10\n", + "# num_utters = 20\n", "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "speaker_idxs = random.sample(list(speakers), num_speakers)\n", "\n", "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " speakers_utter = speaker_to_utter[speaker_idx]\n", + " utter_idxs = random.sample(speakers_utter, num_utters)\n", " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" + " embed = np.array(embeddings[utter_idx][\"embedding\"])\n", + " embeds.append(embed)\n", + " labels.append(speaker_idx)\n", + " locations.append(utt_to_wav[utter_idx])\n", + "embeds = np.stack(embeds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load embeddings with UMAP" + "### Load embeddings with UMAP" ] }, { @@ -222,9 +232,7 @@ "source": [ "### Interactively charting the data in Bokeh\n", "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "You can use the regular Bokeh [tools](https://docs.bokeh.org/en/latest/docs/user_guide/interaction/tools.html) to explore the data, with reset setting it back to normal\n", "\n", "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", "\n", @@ -238,22 +246,17 @@ "outputs": [], "source": [ "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", + " data=dict(\n", + " x=projection.T[0].tolist(),\n", + " y=projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels,\n", " )\n", + ")\n", "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", + "hover = HoverTool(tooltips=[(\"file\", \"@desc\"), (\"speaker\", \"@label\")])\n", "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", + "### Optionally consider adding these to the tooltips if you want additional detail\n", "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", "\n", @@ -261,10 +264,13 @@ "pal_size = max(len(factors), 3)\n", "pal = Category10[pal_size]\n", "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "p = figure(width=600, 
height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n", + "p.scatter(\n", + " \"x\",\n", + " \"y\",\n", + " source=source_wav_stems,\n", + " color=factor_cmap(\"label\", palette=pal, factors=factors),\n", + ")\n", "\n", "url = \"http://localhost:8000/@desc\"\n", "taptool = p.select(type=TapTool)\n", @@ -292,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd $AUDIO_PATH\n", + "%cd $dataset_path\n", "%pwd\n", "!python -m http.server" ] @@ -300,7 +306,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -314,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index a7baf29e31..e9516d3d8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,10 @@ build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.25.1" +version = "0.25.3" description = "Deep learning for Text to Speech." readme = "README.md" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.13" license = {text = "MPL-2.0"} authors = [ {name = "Eren Gölge", email = "egolge@coqui.ai"} @@ -39,7 +39,6 @@ maintainers = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -70,6 +69,7 @@ dependencies = [ "pyyaml>=6.0", "fsspec[http]>=2023.6.0", "packaging>=23.1", + "typing_extensions>=4.10", # Inference "pysbd>=0.3.4", # Training @@ -87,13 +87,13 @@ dependencies = [ "encodec>=0.1.1", # XTTS "num2words>=0.5.14", - "spacy[ja]>=3,<3.8", + "spacy[ja]>=3.2,<3.8", ] [project.optional-dependencies] # Only used in notebooks notebooks = [ - "bokeh==1.4.0", + "bokeh>=3.0.3", "pandas>=1.4,<2.0", "umap-learn>=0.5.1", ] @@ -115,7 +115,7 @@ ko = [ ] # Japanese ja = [ - "mecab-python3>=1.0.2", + "mecab-python3>=1.0.6", "unidic-lite==1.0.8", "cutlet>=0.2.0", ] @@ -135,11 +135,10 @@ all = [ [dependency-groups] dev = [ - "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", - "pre-commit>=3", - "ruff==0.7.0", + "pre-commit>=4", + "pytest>=8", + "ruff==0.9.1", ] # Dependencies for building the documentation docs = [ @@ -173,7 +172,6 @@ exclude = [ "/.readthedocs.yml", "/Makefile", "/dockerfiles", - "/run_bash_tests.sh", "/scripts", "/tests", ] @@ -192,6 +190,7 @@ lint.extend-select = [ "F704", # yield-outside-function "F706", # return-outside-function "F841", # unused-variable + "G004", # no f-string in logging "I", # import sorting "PIE790", # unnecessary-pass "PLC", @@ -201,6 +200,7 @@ lint.extend-select = [ "PLR0911", # too-many-return-statements "PLR1711", # useless-return "PLW", + "UP", # pyupgrade "W291", # trailing-whitespace "NPY201", # NumPy 2.0 deprecation ] @@ -231,14 +231,10 @@ max-returns = 7 "E402", # module level import not at top of file ] -[tool.black] -line-length = 120 -target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/run_bash_tests.sh b/run_bash_tests.sh deleted file mode 100755 index 2f5ba88934..0000000000 --- a/run_bash_tests.sh +++ /dev/null @@ -1,7 +0,0 @@ -set -e -TF_CPP_MIN_LOG_LEVEL=3 - -# runtime bash based tests -# TODO: move these to 
python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh diff --git a/tests/__init__.py b/tests/__init__.py index f0a8b2f118..1a03d07552 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,8 @@ import os +from collections.abc import Callable +from typing import Optional +import pytest from trainer.generic_utils import get_cuda from TTS.config import BaseDatasetConfig @@ -44,6 +47,12 @@ def run_cli(command): assert exit_status == 0, f" [!] command `{command}` failed." +def run_main(main_func: Callable, args: list[str] | None = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code + + def get_test_data_config(): return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. 
- self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_server.py b/tests/aux_tests/test_server.py new file mode 100644 index 0000000000..1b691f9596 --- /dev/null +++ b/tests/aux_tests/test_server.py @@ -0,0 +1,47 @@ +import os +import signal +import socket +import subprocess +import time +import wave + +import pytest +import requests + +PORT = 5003 + + +def wait_for_server(host, port, timeout=30): + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.create_connection((host, port), timeout=2): + return True + except (OSError, ConnectionRefusedError): + time.sleep(1) + raise TimeoutError(f"Server at {host}:{port} did not start within {timeout} seconds.") + + +@pytest.fixture(scope="module", autouse=True) +def start_flask_server(): + server_process = subprocess.Popen( + ["python", "-m", "TTS.server.server", "--port", str(PORT)], + ) + wait_for_server("localhost", PORT) + yield + os.kill(server_process.pid, signal.SIGTERM) + server_process.wait() + + +def test_flask_server(tmp_path): + url = f"http://localhost:{PORT}/api/tts?text=synthesis%20schmynthesis" + response = requests.get(url) + assert response.status_code == 200, f"Request failed with status code 
{response.status_code}" + + wav_path = tmp_path / "output.wav" + with wav_path.open("wb") as f: + f.write(response.content) + + with wave.open(str(wav_path), "rb") as wav_file: + num_frames = wav_file.getnframes() + assert num_frames > 0, "WAV file contains no frames." diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 5d8626faa6..0e15db2ab0 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -1,88 +1,86 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), ) - run_cli(command) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + print(config) + # train the model for one epoch + run_test_train() -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest 
folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() + # train the model for one epoch + run_test_train() -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/bash_tests/test_demo_server.sh b/tests/bash_tests/test_demo_server.sh deleted file mode 100755 index ebd0bc8b89..0000000000 --- a/tests/bash_tests/test_demo_server.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -xe - -python -m TTS.server.server & -SERVER_PID=$! - -echo 'Waiting for server...' 
-sleep 30 - -curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" -python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav - -kill $SERVER_PID - -rm /tmp/audio.wav diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..975281c549 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,210 +44,210 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False -print(" > Dynamic data loader test: {}".format(DATA_EXIST)) - - -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * 
c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +print(f" > Dynamic data loader test: {DATA_EXIST}") + + +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == 
batch_size + assert linear_input.shape[2] == ap.fft_size // 2 + 1 + assert mel_input.shape[2] == c.audio["num_mels"] + assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length + assert isinstance(speaker_name[0], str) + + # make sure that the computed mels and the waveform match and correctly computed + mel_new = ap.melspectrogram(wavs[0].squeeze().numpy()) + # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding + mel_new = mel_new[:, : mel_lengths[0]] + ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) + mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] + assert abs(mel_diff.sum()) < 1e-5 + + # check normalization ranges + if ap.symmetric_norm: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= -ap.max_norm + assert mel_input.min() < 0 + else: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= 0 + + +def test_batch_group_shuffle(): + dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + assert avg_length >= last_length + assert is_items_reordered + + +def test_start_by_longest(): + """Test start_by_longest option. + + The first item of the first batch must be longer than all the other items. + """ + dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + assert all(max_len >= mel_lengths) + + +def test_padding_and_spectrograms(tmp_path): + def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): + assert linear_input[idx, -1].sum() != 0 # check padding + assert linear_input[idx, -2].sum() != 0 + assert mel_input[idx, -1].sum() != 0 + assert mel_input[idx, -2].sum() != 0 + assert stop_target[idx, -1] == 1 + assert stop_target[idx, -2] == 0 + assert stop_target[idx].sum() == 1 + assert len(mel_lengths.shape) == 1 + assert mel_lengths[idx] == linear_input[idx].shape[0] + assert mel_lengths[idx] == mel_input[idx].shape[0] + + dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32) + mel = ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail.
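+ # exact equality fails for an unknown reason (see NOTE above), so a small tolerance is used below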
+ assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." 
--out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index f9067530e6..370a541b97 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -240,12 +240,8 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" - ) - self._EXPECTED = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" - ) + self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" + self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 9be1f0bf41..f5d342bb00 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -24,6 +24,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + assert phoneme_cleaners("1" + "0" * 35) == "one hundred decillion" + assert phoneme_cleaners("1" + "0" * 36) == "one" + " zero" * 36 def test_multilingual_phoneme_cleaners() -> None: @@ -43,11 +45,11 @@ def test_normalize_unicode() -> None: ("na\u0303", "nã"), ("o\u0302u", "ôu"), ("n\u0303", "ñ"), - ("\u4E2D\u56FD", "中国"), + ("\u4e2d\u56fd", "中国"), ("niño", "niño"), ("a\u0308", "ä"), ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"), - ("\u03B1\u03B2", "αβ"), + ("\u03b1\u03b2", "αβ"), ] for arg, expect in test_cases: assert normalize_unicode(arg) == expect diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py index 794478dca3..2290e9a6cc 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -21,7 +21,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 
1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -29,14 +29,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -52,7 +52,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -60,14 +60,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class MSELossMaskedTests(unittest.TestCase): @@ -85,7 +85,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -93,14 +93,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -116,7 +116,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 
8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -124,14 +124,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class SSIMLossTests(unittest.TestCase): @@ -153,7 +153,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.ones(4) * 58).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() >= 1.0, "0 vs {}".format(output.item()) + assert output.item() >= 1.0, f"0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -168,7 +168,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -184,7 +184,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 57, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -192,14 +192,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 57, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class BCELossTest(unittest.TestCase): diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py index 4789d53d9e..f4b8d5cadd 100644 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import 
get_device_id, run_cli from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = NeuralhmmTTSConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - 
os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py index d86bde6854..e2dec3c899 100644 --- a/tests/tts_tests/test_overflow_train.py +++ b/tests/tts_tests/test_overflow_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.overflow_config import OverflowConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = OverflowConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + 
f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 2aac7f101d..30efe38d9f 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_speedy_speech_config.json" + output_path = tmp_path / "train_outputs" -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = SpeedySpeechConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder 
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index d2d1d5c35f..12791feda4 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,79 +1,81 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = 
tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=False, + use_d_vector_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with open(continue_config_path, encoding="utf-8") as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 72b6bcd46b..72069bf943 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,8 +72,8 @@ def test_train_step(self): # pylint: disable=no-self-use for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -131,8 +131,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -198,8 +198,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -254,8 +254,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -321,8 +321,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index 83a07d1a6c..2696edb1b6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,77 +1,79 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + num_speakers=4, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index df0e934d8e..f8667b6d02 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,72 +1,72 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = 
max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 43e72417c2..9521cfea26 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -67,8 +67,8 @@ def test_in_out(): output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None) assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) + assert output.shape[1] == 80, f"size not {output.shape[1]}" + assert output.shape[2] == 2, f"size not {output.shape[2]}" assert stop_tokens.shape[0] == 4 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 7ec3f0df1b..5f9af86e7e 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -51,7 +51,7 @@ def test_train_step(): criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -71,8 +71,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -105,7 +105,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -127,8 +127,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -165,7 +165,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -186,8 +186,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -217,7 +217,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -238,8 +238,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -288,7 +288,7 @@ def test_train_step(): criterion = model.get_criterion() optimizer = model.get_optimizer() model.train() - print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron with Capacitron VAE model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -305,8 +305,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -341,7 +341,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 17f1fd46a6..cc91b18c34 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -1,64 +1,63 @@ -import glob -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron_config import TacotronConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS 
API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = TacotronConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + r=5, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index c8a52e1c1b..790439ecb2 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -373,8 +373,8 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - name, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count = count + 1 diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 741bda91e9..b95e1deed3 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -1,61 +1,61 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + 
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 09df7d29f2..189e6cfb4d 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,110 +1,111 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = 
os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + language_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert 
config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {language_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 7ae09c0e5c..8b8757422c 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,117 +1,117 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python 
TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + config.d_vector_dim = 256 + + # duration predictor + config.model_args.use_sdp = True + config.use_sdp = True + + # activate language and speaker samplers + config.use_language_weighted_sampler = True + config.language_weighted_sampler_alpha = 10 + config.use_speaker_weighted_sampler = True + 
config.speaker_weighted_sampler_alpha = 5 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + language_id = "en" + continue_speakers_path = config.d_vector_file + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {language_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 69fae21f8d..6678cca90c 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,83 +1,83 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-1"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-1"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 -# active multispeaker d-vec mode 
-config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) -config.save_json(config_path) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 78f42d154b..e0f7a656b0 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = 
max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py index 91c3c35bc6..1582f51fd4 100644 --- a/tests/tts_tests2/test_align_tts_train.py +++ b/tests/tts_tests2/test_align_tts_train.py @@ -1,72 +1,71 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.align_tts_config import AlignTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) + config = AlignTTSConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + 
test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index 1e5cd49f73..74d7a0a734 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -1,100 +1,98 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) 
-speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs( + use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 + ) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + output_path=output_path, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + speaker_embedding_channels=256, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = False + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert 
config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index 9bbf7a55ea..68f790599e 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -1,94 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs(use_speaker_embedding=False) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], + output_path=output_path, + num_speakers=4, + use_speaker_embedding=True, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.dataset_name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index 3e6fbd2e86..4676ee4869 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -1,97 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -vocoder_config = VocoderConfig() + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs() + vocoder_config = VocoderConfig() -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) + config = DelightfulTTSConfig( + audio=audio_config, + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + run_eval=True, + test_delay_epochs=-1, + binary_align_loss_alpha=0.0, + epochs=1, + print_step=1, + use_attn_priors=False, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + use_speaker_embedding=False, + ) + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - 
"--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs -1" + ) -run_cli(command_train) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == -1 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py index e6bc9f9feb..379e2f346b 100644 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py @@ -1,92 +1,94 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + 
f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py index fe87c8b600..e0838a2049 100644 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ b/tests/tts_tests2/test_fast_pitch_train.py @@ -1,91 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + 
config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py index 735d2fc4c6..348729c6f4 100644 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py @@ -1,95 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=tmp_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + 
epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py index 07fc5a1a2c..ab513ec827 100644 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ b/tests/tts_tests2/test_fastspeech_2_train.py @@ -1,94 +1,96 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=output_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + 
print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 3c7ac51556..c92063576f 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -42,8 +42,8 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -107,7 +107,7 @@ def _test_forward(self, batch_size): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -134,7 +134,7 @@ def _test_forward_with_d_vector(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -160,7 +160,7 @@ def _test_forward_with_speaker_id(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size): # inference encoder and decoder with MAS y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) y2 = model.decoder_inference(mel_spec, mel_lengths) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape + assert y2["model_outputs"].shape == y["model_outputs"].shape, ( + "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) ) def test_inference_with_MAS(self): @@ -261,7 +261,7 @@ def test_train_step(self): # reference model to compare model weights model_ref = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) count = 0 diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py index 8236607c25..f03139ac77 100644 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_glow_tts_d-vectors_train.py @@ -1,79 +1,80 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + 
output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with 
continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py index 4a8bd0658d..b9fe93a2fa 100644 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py @@ -1,76 +1,77 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=True, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - 
"--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py index 1d7f913575..3f1bf3a794 100644 --- a/tests/tts_tests2/test_glow_tts_train.py +++ b/tests/tts_tests2/test_glow_tts_train.py @@ -1,73 +1,74 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index fe07b2723c..784e32a68d 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -55,7 +55,7 @@ def _test_forward(self, batch_size): config = FreeVCConfig() model = FreeVC(config).to(device) model.train() - print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) + print(f" > Num parameters for FreeVC model:{count_parameters(model)}") mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) @@ -80,9 +80,9 @@ def _test_inference(self, batch_size): wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths) - assert ( - output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1] - ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], ( + f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + ) def test_inference(self): self._test_inference(1) @@ -95,9 +95,9 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == 
source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + ) def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py index c9f7ae3931..703873ea47 100644 --- a/tests/vc_tests/test_openvoice.py +++ b/tests/vc_tests/test_openvoice.py @@ -16,7 +16,6 @@ class TestOpenVoice(unittest.TestCase): - @staticmethod def _create_inputs_inference(): source_wav = torch.rand(16100) @@ -37,6 +36,6 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length - ), f"{output_wav.shape} != {source_wav.shape}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}" + ) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - 
print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py 
--config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_training.py b/tests/vocoder_tests/test_training.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/vocoder_tests/test_training.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, 
"*/")), key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ 
b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad 
* 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..d1d3610b70 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,43 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any(), f"param {i} with shape {param.shape} not updated!! \n{param}\n{param_ref}" diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 
--- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index bb592f1f2d..4d22b8102f 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -1,10 +1,9 @@ -import os -import shutil +from pathlib import Path +import pytest import torch from trainer import Trainer, TrainerArgs -from tests import get_tests_output_path from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.layers.xtts.dvae import DiscreteVAE @@ -28,37 +27,9 @@ DASHBOARD_LOGGER = "tensorboard" LOGGER_URI = None -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - +XTTS_CHECKPOINT = None # model.pth file # Training sentences generations SPEAKER_REFERENCE = [ @@ -66,99 +37,122 @@ ] # speaker reference to be used in training test sentences LANGUAGE = config_dataset.language - # Training Parameters OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False START_WITH_EVAL = False # if True it will star with evaluation BATCH_SIZE = 2 # set here the batch size GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. 
-# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() -# remove output path -shutil.rmtree(OUT_PATH) +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms on test time + # DVAE parameters: For the training we need the dvae to extract the dvae tokens, + # given that you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # it was adjusted accordly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 454e867385..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig -from TTS.tts.models.xtts import XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
-TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. 
It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too 
big for CI")
+def test_xtts_v2(tmp_path):
+    """XTTS is too big to run on github actions. We need to test it locally"""
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/xtts_v2",
+        "--text",
+        "C'est un exemple.",
+        "--language_idx",
+        "fr",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+        "--speaker_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"),
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"),
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_xtts_v2_streaming(manager):
+    """Testing the new inference_stream method"""
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import Xtts
+
+    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
+    model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2")
+    config = XttsConfig()
+    config.load_json(model_path / "config.json")
+    model = Xtts.init_from_config(config)
+    model.load_checkpoint(config, checkpoint_dir=str(model_path))
+    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+    print("Computing speaker latents...")
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+
+    print("Inference...")
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        if i == 0:
+            assert chunk.shape[-1] > 5000
+        wav_chunks.append(chunk)
+    assert len(wav_chunks) > 1
+    normal_len = sum([len(chunk) for chunk in wav_chunks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=1.5,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        wav_chunks.append(chunk)
+    fast_len = sum([len(chunk) for chunk in wav_chunks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=0.66,
+    )
+    wav_chunks = []
+    for i, chunk in enumerate(chunks):
+        wav_chunks.append(chunk)
+    slow_len = sum([len(chunk) for chunk in wav_chunks])
+
+    assert slow_len > normal_len
+    assert normal_len > fast_len
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_tortoise(tmp_path):
+    args = [
+        "--model_name",
+        "tts_models/en/multi-dataset/tortoise-v2",
+        "--text",
+        "This is an example.",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
+
+
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI")
+def test_bark(tmp_path):
+    """Bark is too big to run on github actions. We need to test it locally"""
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/bark",
+        "--text",
+        "This is an example.",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--no-progress_bar",
+    ]
+    if torch.cuda.is_available():
+        args.append("--use_cuda")
+    run_main(main, args)
diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py
index 461b4fbe12..9f02672ef1 100644
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -2,10 +2,11 @@
 import os
 import shutil
 
-import torch
-from trainer.io import get_user_data_dir
+import pytest
 
-from tests import get_tests_data_path, get_tests_output_path, run_cli
+from tests import get_tests_data_path, run_main
+from TTS.api import TTS
+from TTS.bin.synthesize import main
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.manage import ModelManager
@@ -18,249 +19,81 @@
 ]
 
 
-def run_models(offset=0, step=1):
-    """Check if all the models are downloadable and tts models run correctly."""
-    print(" > Run synthesizer with all the models.")
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
-    model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS]
-    print("Model names:", model_names)
-    for model_name in model_names[offset::step]:
-        print(f"\n > Run - {model_name}")
-        model_path, _, _ = manager.download_model(model_name)
-        if "tts_models" in model_name:
-            local_download_dir = model_path.parent
-            # download and run the model
-            speaker_files = list(local_download_dir.glob("speaker*"))
-            language_files = list(local_download_dir.glob("language*"))
-            speaker_arg = ""
-            language_arg = ""
-            if len(speaker_files) > 0:
-                # multi-speaker model
-                if "speaker_ids" in speaker_files[0].stem:
-                    speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
-                elif "speakers" in speaker_files[0].stem:
-                    speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
-                speakers = list(speaker_manager.name_to_id.keys())
-                if len(speakers) > 1:
-                    speaker_arg = f'--speaker_idx "{speakers[0]}"'
-            if len(language_files) > 0 and "language_ids" in language_files[0].stem:
-                # multi-lingual model
-                language_manager = LanguageManager(language_ids_file_path=language_files[0])
-                languages = language_manager.language_names
-                if len(languages) > 1:
-                    language_arg = f'--language_idx "{languages[0]}"'
-            run_cli(
-                f'tts --model_name {model_name} --text "This is an example." '
-                f'--out_path "{output_path}" {speaker_arg} {language_arg} --no-progress_bar'
-            )
-            # remove downloaded models
-            shutil.rmtree(local_download_dir)
-            shutil.rmtree(get_user_data_dir("tts"))
-        elif "voice_conversion_models" in model_name:
-            speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-            reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
-            run_cli(
-                f"tts --model_name {model_name} "
-                f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar'
-            )
-        else:
-            # only download the model
-            manager.download_model(model_name)
-        print(f" | > OK: {model_name}")
-
-
-def test_xtts():
-    """XTTS is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
-            f'--speaker_wav "{speaker_wav}" --language_idx "en"'
-        )
+@pytest.fixture(autouse=True)
+def run_around_tests(tmp_path):
+    """Download models to a temp folder and delete it afterwards."""
+    os.environ["TTS_HOME"] = str(tmp_path)
+    yield
+    shutil.rmtree(tmp_path)
+
+
+@pytest.fixture
+def manager(tmp_path):
+    """Set up model manager."""
+    return ModelManager(output_prefix=tmp_path, progress_bar=False)
+
+
+# To split tests into different CI jobs
+num_partitions = int(os.getenv("NUM_PARTITIONS", "1"))
+partition = int(os.getenv("TEST_PARTITION", "0"))
+model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS]
+model_names.extend(["tts_models/deu/fairseq/vits", "tts_models/sqi/fairseq/vits"])
+model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition]
+
+
+@pytest.mark.parametrize("model_name", model_names)
+def test_models(tmp_path, model_name, manager):
+    print(f"\n > Run - {model_name}")
+    output_path = str(tmp_path / "output.wav")
+    model_path, _, _ = manager.download_model(model_name)
+    args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"]
+    if "tts_models" in model_name:
+        local_download_dir = model_path.parent
+        # download and run the model
+        speaker_files = list(local_download_dir.glob("speaker*"))
+        language_files = list(local_download_dir.glob("language*"))
+        speaker_arg = []
+        language_arg = []
+        if len(speaker_files) > 0:
+            # multi-speaker model
+            if "speaker_ids" in speaker_files[0].stem:
+                speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
+            elif "speakers" in speaker_files[0].stem:
+                speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
+            speakers = list(speaker_manager.name_to_id.keys())
+            if len(speakers) > 1:
+                speaker_arg = ["--speaker_idx", speakers[0]]
+        if len(language_files) > 0 and "language_ids" in language_files[0].stem:
+            # multi-lingual model
+            language_manager = LanguageManager(language_ids_file_path=language_files[0])
+            languages = language_manager.language_names
+            if len(languages) > 1:
+                language_arg = ["--language_idx", languages[0]]
+        run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg])
+    elif "voice_conversion_models" in model_name:
+        speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
+        reference_wav1 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0028.wav")
+        reference_wav2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
+        run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav1, reference_wav2])
     else:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
-            f'--speaker_wav "{speaker_wav}" --language_idx "en"'
-        )
-
-
-def test_xtts_streaming():
-    """Testing the new inference_stream method"""
-    from TTS.tts.configs.xtts_config import XttsConfig
-    from TTS.tts.models.xtts import Xtts
-
-    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
-    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
-    speaker_wav.append(speaker_wav_2)
-    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
-    model = Xtts.init_from_config(config)
-    model.load_checkpoint(config, checkpoint_dir=model_path)
-    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        # only download the model
+        manager.download_model(model_name)
+    print(f" | > OK: {model_name}")
 
-    print("Computing speaker latents...")
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
-
-    print("Inference...")
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        if i == 0:
-            assert chunk.shape[-1] > 5000
-        wav_chuncks.append(chunk)
-    assert len(wav_chuncks) > 1
-
-
-def test_xtts_v2():
-    """XTTS is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
-            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
-        )
-    else:
-        run_cli(
-            "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
-            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
-        )
-
-def test_xtts_v2_streaming():
-    """Testing the new inference_stream method"""
-    from TTS.tts.configs.xtts_config import XttsConfig
-    from TTS.tts.models.xtts import Xtts
-
-    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
-    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
-    model = Xtts.init_from_config(config)
-    model.load_checkpoint(config, checkpoint_dir=model_path)
-    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-
-    print("Computing speaker latents...")
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
-
-    print("Inference...")
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        if i == 0:
-            assert chunk.shape[-1] > 5000
-        wav_chuncks.append(chunk)
-    assert len(wav_chuncks) > 1
-    normal_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-        speed=1.5,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        wav_chuncks.append(chunk)
-    fast_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    chunks = model.inference_stream(
-        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
-        "en",
-        gpt_cond_latent,
-        speaker_embedding,
-        speed=0.66,
-    )
-    wav_chuncks = []
-    for i, chunk in enumerate(chunks):
-        wav_chuncks.append(chunk)
-    slow_len = sum([len(chunk) for chunk in wav_chuncks])
-
-    assert slow_len > normal_len
-    assert normal_len > fast_len
-
-
-def test_tortoise():
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
-        )
-    else:
-        run_cli(
-            f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
-        )
-
-
-def test_bark():
-    """Bark is too big to run on github actions. We need to test it locally"""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    use_gpu = torch.cuda.is_available()
-    if use_gpu:
-        run_cli(
-            f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
-        )
-    else:
-        run_cli(
-            f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
-        )
-
-
-def test_voice_conversion():
+def test_voice_conversion(tmp_path):
     print(" > Run voice conversion inference using YourTTS model.")
-    model_name = "tts_models/multilingual/multi-dataset/your_tts"
-    language_id = "en"
-    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-    run_cli(
-        f"tts --model_name {model_name}"
-        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar"
-    )
-
-
-"""
-These are used to split tests into different actions on Github.
-"""
-
-
-def test_models_offset_0_step_3():
-    run_models(offset=0, step=3)
-
-
-def test_models_offset_1_step_3():
-    run_models(offset=1, step=3)
-
-
-def test_models_offset_2_step_3():
-    run_models(offset=2, step=3)
+    args = [
+        "--model_name",
+        "tts_models/multilingual/multi-dataset/your_tts",
+        "--out_path",
+        str(tmp_path / "output.wav"),
+        "--speaker_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"),
+        "--reference_wav",
+        os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"),
+        "--language_idx",
+        "en",
+        "--no-progress_bar",
+    ]
+    run_main(main, args)
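For readers of this patch, a minimal sketch of the round-robin selection driven by the NUM_PARTITIONS and TEST_PARTITION environment variables (used above by the new zoo CI job and the parametrized tests in tests/zoo_tests/test_models.py); the model names below are placeholders, not real zoo models, and the real list comes from TTS.list_models():

import os

# Placeholder model list; in the test suite this comes from TTS.list_models().
model_names = ["model_a", "model_b", "model_c", "model_d", "model_e"]

# Each CI job sets a different TEST_PARTITION and keeps only its slice.
num_partitions = int(os.getenv("NUM_PARTITIONS", "3"))
partition = int(os.getenv("TEST_PARTITION", "0"))

selected = [name for i, name in enumerate(model_names) if i % num_partitions == partition]
print(selected)  # with the defaults above: ['model_a', 'model_d']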