From 6f34c74367e43a74ef7d745d5d9645573072fee1 Mon Sep 17 00:00:00 2001
From: Enno Hermann <enno.hermann@idiap.ch>
Date: Thu, 12 Dec 2024 22:34:59 +0100
Subject: [PATCH] test: use temp output folders and pytest.parametrize

---
 tests/data_tests/test_loader.py               | 407 +++++++++---------
 tests/inference_tests/test_synthesize.py      |   8 +-
 .../test_fullband_melgan_train.py             |  73 ++--
 tests/vocoder_tests/test_hifigan_train.py     |  70 ++-
 tests/vocoder_tests/test_melgan_train.py      |  73 ++--
 .../test_multiband_melgan_train.py            |  75 ++--
 .../test_parallel_wavegan_train.py            |  67 +--
 .../test_vocoder_gan_datasets.py              |   6 +-
 tests/vocoder_tests/test_vocoder_losses.py    |   7 +-
 tests/vocoder_tests/test_vocoder_pqmf.py      |   7 +-
 .../test_vocoder_wavernn_datasets.py          |  72 ++--
 tests/vocoder_tests/test_wavegrad_train.py    |  96 ++---
 tests/vocoder_tests/test_wavernn_train.py     |  72 ++--
 13 files changed, 497 insertions(+), 536 deletions(-)

diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index 252b429a16..f260af161e 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -1,12 +1,12 @@
 import os
 import shutil
-import unittest
 
 import numpy as np
+import pytest
 import torch
 from torch.utils.data import DataLoader
 
-from tests import get_tests_data_path, get_tests_output_path
+from tests import get_tests_data_path
 from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.datasets.dataset import TTSDataset
@@ -15,9 +15,6 @@
 
 # pylint: disable=unused-variable
 
-OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
-os.makedirs(OUTPATH, exist_ok=True)
-
 # create a dummy config for testing data loaders.
 c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
 c.r = 5
@@ -47,6 +44,9 @@
 
 dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac]
 
+ap = AudioProcessor(**c.audio)
+max_loader_iter = 4
+
 DATA_EXIST = True
 if not os.path.exists(c.data_path):
     DATA_EXIST = False
@@ -54,203 +54,200 @@
 print(" > Dynamic data loader test: {}".format(DATA_EXIST))
 
 
-class TestTTSDataset(unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.max_loader_iter = 4
-        self.ap = AudioProcessor(**c.audio)
-
-    def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
-        # load dataset
-        meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
-        items = meta_data_train + meta_data_eval
-        tokenizer, _ = TTSTokenizer.init_from_config(c)
-        dataset = TTSDataset(
-            outputs_per_step=r,
-            compute_linear_spec=True,
-            return_wav=True,
-            tokenizer=tokenizer,
-            ap=self.ap,
-            samples=items,
-            batch_group_size=bgs,
-            min_text_len=c.min_text_len,
-            max_text_len=c.max_text_len,
-            min_audio_len=c.min_audio_len,
-            max_audio_len=c.max_audio_len,
-            start_by_longest=start_by_longest,
-        )
-
-        # add preprocess to force the length computation
-        if preprocess_samples:
-            dataset.preprocess_samples()
-
-        dataloader = DataLoader(
-            dataset,
-            batch_size=batch_size,
-            shuffle=False,
-            collate_fn=dataset.collate_fn,
-            drop_last=True,
-            num_workers=c.num_loader_workers,
-        )
-        return dataloader, dataset
-
-    def test_loader(self):
-        for dataset_config in dataset_configs:
-            dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True)
-            for i, data in enumerate(dataloader):
-                if i == self.max_loader_iter:
-                    break
-                text_input = data["token_id"]
-                _ = data["token_id_lengths"]
-                speaker_name = data["speaker_names"]
-                linear_input = data["linear"]
-                mel_input = data["mel"]
-                mel_lengths = data["mel_lengths"]
-                _ = data["stop_targets"]
-                _ = data["item_idxs"]
-                wavs = data["waveform"]
-
-                neg_values = text_input[text_input < 0]
-                check_count = len(neg_values)
-
-                # check basic conditions
-                self.assertEqual(check_count, 0)
-                self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size)
-                self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1)
-                self.assertEqual(mel_input.shape[2], c.audio["num_mels"])
-                self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length)
-                self.assertIsInstance(speaker_name[0], str)
-
-                # make sure that the computed mels and the waveform match and correctly computed
-                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
-                # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
-                mel_new = mel_new[:, : mel_lengths[0]]
-                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
-                mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
-                self.assertLess(abs(mel_diff.sum()), 1e-5)
-
-                # check normalization ranges
-                if self.ap.symmetric_norm:
-                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
-                    self.assertGreaterEqual(
-                        mel_input.min(), -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
-                    )
-                    self.assertLess(mel_input.min(), 0)
-                else:
-                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
-                    self.assertGreaterEqual(mel_input.min(), 0)
-
-    def test_batch_group_shuffle(self):
-        dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav)
-        last_length = 0
-        frames = dataset.samples
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            mel_lengths = data["mel_lengths"]
-            avg_length = mel_lengths.numpy().mean()
-        dataloader.dataset.preprocess_samples()
-        is_items_reordered = False
-        for idx, item in enumerate(dataloader.dataset.samples):
-            if item != frames[idx]:
-                is_items_reordered = True
-                break
-        self.assertGreaterEqual(avg_length, last_length)
-        self.assertTrue(is_items_reordered)
-
-    def test_start_by_longest(self):
-        """Test start_by_longest option.
-
-        Ther first item of the fist batch must be longer than all the other items.
-        """
-        dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
-        dataloader.dataset.preprocess_samples()
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            mel_lengths = data["mel_lengths"]
-            if i == 0:
-                max_len = mel_lengths[0]
-            print(mel_lengths)
-            self.assertTrue(all(max_len >= mel_lengths))
-
-    def test_padding_and_spectrograms(self):
-        def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
-            self.assertNotEqual(linear_input[idx, -1].sum(), 0)  # check padding
-            self.assertNotEqual(linear_input[idx, -2].sum(), 0)
-            self.assertNotEqual(mel_input[idx, -1].sum(), 0)
-            self.assertNotEqual(mel_input[idx, -2].sum(), 0)
-            self.assertEqual(stop_target[idx, -1], 1)
-            self.assertEqual(stop_target[idx, -2], 0)
-            self.assertEqual(stop_target[idx].sum(), 1)
-            self.assertEqual(len(mel_lengths.shape), 1)
-            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
-            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])
-
-        dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav)
-
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            linear_input = data["linear"]
-            mel_input = data["mel"]
-            mel_lengths = data["mel_lengths"]
-            stop_target = data["stop_targets"]
-            item_idx = data["item_idxs"]
-
-            # check mel_spec consistency
-            wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
-            mel = self.ap.melspectrogram(wav).astype("float32")
-            mel = torch.FloatTensor(mel).contiguous()
-            mel_dl = mel_input[0]
-            # NOTE: Below needs to check == 0 but due to an unknown reason
-            # there is a slight difference between two matrices.
-            # TODO: Check this assert cond more in detail.
-            self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)
-
-            # check mel-spec correctness
-            mel_spec = mel_input[0].cpu().numpy()
-            wav = self.ap.inv_melspectrogram(mel_spec.T)
-            self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
-            shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")
-
-            # check linear-spec
-            linear_spec = linear_input[0].cpu().numpy()
-            wav = self.ap.inv_spectrogram(linear_spec.T)
-            self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
-            shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")
-
-            # check the outputs
-            check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)
-
-        # Test for batch size 2
-        dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav)
-
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            linear_input = data["linear"]
-            mel_input = data["mel"]
-            mel_lengths = data["mel_lengths"]
-            stop_target = data["stop_targets"]
-            item_idx = data["item_idxs"]
-
-            # set id to the longest sequence in the batch
-            if mel_lengths[0] > mel_lengths[1]:
-                idx = 0
-            else:
-                idx = 1
-
-            # check the longer item in the batch
-            check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)
-
-            # check the other item in the batch
-            self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
-            self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
-            self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
-            self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1])
-            self.assertEqual(len(mel_lengths.shape), 1)
-
-            # check batch zero-frame conditions (zero-frame disabled)
-            # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
-            # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
+def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
+    # load dataset
+    meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
+    items = meta_data_train + meta_data_eval
+    tokenizer, _ = TTSTokenizer.init_from_config(c)
+    dataset = TTSDataset(
+        outputs_per_step=r,
+        compute_linear_spec=True,
+        return_wav=True,
+        tokenizer=tokenizer,
+        ap=ap,
+        samples=items,
+        batch_group_size=bgs,
+        min_text_len=c.min_text_len,
+        max_text_len=c.max_text_len,
+        min_audio_len=c.min_audio_len,
+        max_audio_len=c.max_audio_len,
+        start_by_longest=start_by_longest,
+    )
+
+    # add preprocess to force the length computation
+    if preprocess_samples:
+        dataset.preprocess_samples()
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=dataset.collate_fn,
+        drop_last=True,
+        num_workers=c.num_loader_workers,
+    )
+    return dataloader, dataset
+
+
+@pytest.mark.parametrize("dataset_config", dataset_configs)
+def test_loader(dataset_config: BaseDatasetConfig):
+    batch_size = 1
+    dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True)
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        text_input = data["token_id"]
+        _ = data["token_id_lengths"]
+        speaker_name = data["speaker_names"]
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        _ = data["stop_targets"]
+        _ = data["item_idxs"]
+        wavs = data["waveform"]
+
+        neg_values = text_input[text_input < 0]
+        check_count = len(neg_values)
+
+        # check basic conditions
+        assert check_count == 0
+        assert linear_input.shape[0] == mel_input.shape[0] == batch_size
+        assert linear_input.shape[2] == ap.fft_size // 2 + 1
+        assert mel_input.shape[2] == c.audio["num_mels"]
+        assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
+        assert isinstance(speaker_name[0], str)
+
+        # make sure that the computed mels and the waveform match and correctly computed
+        mel_new = ap.melspectrogram(wavs[0].squeeze().numpy())
+        # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
+        mel_new = mel_new[:, : mel_lengths[0]]
+        ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
+        mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+        assert abs(mel_diff.sum()) < 1e-5
+
+        # check normalization ranges
+        if ap.symmetric_norm:
+            assert mel_input.max() <= ap.max_norm
+            assert mel_input.min() >= -ap.max_norm
+            assert mel_input.min() < 0
+        else:
+            assert mel_input.max() <= ap.max_norm
+            assert mel_input.min() >= 0
+
+
+def test_batch_group_shuffle():
+    dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav)  # batch_size=2, batch_group_size=16
+    last_length = 0  # NOTE(review): never updated below, so the length assertion only checks avg_length >= 0
+    frames = dataset.samples  # sample order before preprocess_samples(); a reference, not a copy
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        avg_length = mel_lengths.numpy().mean()  # only the last inspected batch's mean survives the loop
+    dataloader.dataset.preprocess_samples()  # presumably regroups the samples; TODO confirm in TTSDataset
+    is_items_reordered = False
+    for idx, item in enumerate(dataloader.dataset.samples):
+        if item != frames[idx]:  # any position that differs from the earlier order counts as reordered
+            is_items_reordered = True
+            break
+    assert avg_length >= last_length
+    assert is_items_reordered
+
+
+def test_start_by_longest():
+    """Test start_by_longest option.
+
+    The first item of the first batch must be at least as long as every other item.
+    """
+    dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
+    dataloader.dataset.preprocess_samples()
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        if i == 0:
+            max_len = mel_lengths[0]  # reference length: first item of the first batch
+        print(mel_lengths)
+        assert all(max_len >= mel_lengths)
+
+
+def test_padding_and_spectrograms(tmp_path):
+    def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
+        assert linear_input[idx, -1].sum() != 0  # check padding
+        assert linear_input[idx, -2].sum() != 0
+        assert mel_input[idx, -1].sum() != 0
+        assert mel_input[idx, -2].sum() != 0
+        assert stop_target[idx, -1] == 1
+        assert stop_target[idx, -2] == 0
+        assert stop_target[idx].sum() == 1
+        assert len(mel_lengths.shape) == 1
+        assert mel_lengths[idx] == linear_input[idx].shape[0]
+        assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+    dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav)
+
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        stop_target = data["stop_targets"]
+        item_idx = data["item_idxs"]
+
+        # check mel_spec consistency
+        wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32)
+        mel = ap.melspectrogram(wav).astype("float32")
+        mel = torch.FloatTensor(mel).contiguous()
+        mel_dl = mel_input[0]
+        # NOTE: Below needs to check == 0 but due to an unknown reason
+        # there is a slight difference between two matrices.
+        # TODO: Check this assert cond more in detail.
+        assert abs(mel.T - mel_dl).max() < 1e-5
+
+        # check mel-spec correctness
+        mel_spec = mel_input[0].cpu().numpy()
+        wav = ap.inv_melspectrogram(mel_spec.T)
+        ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav")
+        shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav")
+
+        # check linear-spec
+        linear_spec = linear_input[0].cpu().numpy()
+        wav = ap.inv_spectrogram(linear_spec.T)
+        ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav")
+        shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav")
+
+        # check the outputs
+        check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)
+
+    # Test for batch size 2
+    dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav)
+
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        stop_target = data["stop_targets"]
+        item_idx = data["item_idxs"]
+
+        # set id to the longest sequence in the batch
+        if mel_lengths[0] > mel_lengths[1]:
+            idx = 0
+        else:
+            idx = 1
+
+        # check the longer item in the batch
+        check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)
+
+        # check the other item in the batch
+        assert linear_input[1 - idx, -1].sum() == 0
+        assert mel_input[1 - idx, -1].sum() == 0
+        assert stop_target[1, mel_lengths[1] - 1] == 1
+        assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1]
+        assert len(mel_lengths.shape) == 1
+
+        # check batch zero-frame conditions (zero-frame disabled)
+        # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
+        # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py
index 28a4088c96..c49ea5ab43 100644
--- a/tests/inference_tests/test_synthesize.py
+++ b/tests/inference_tests/test_synthesize.py
@@ -1,11 +1,9 @@
-import os
+from tests import run_cli
 
-from tests import get_tests_output_path, run_cli
 
-
-def test_synthesize():
+def test_synthesize(tmp_path):
     """Test synthesize.py with diffent arguments."""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
+    output_path = tmp_path / "output.wav"
     run_cli("tts --list_models")
 
     # single speaker model
diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py
index 9d4e193382..972a47b2af 100644
--- a/tests/vocoder_tests/test_fullband_melgan_train.py
+++ b/tests/vocoder_tests/test_fullband_melgan_train.py
@@ -1,43 +1,42 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import FullbandMelganConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
-config = FullbandMelganConfig(
-    batch_size=8,
-    eval_batch_size=8,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=8192,
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
+
+    config = FullbandMelganConfig(
+        batch_size=8,
+        eval_batch_size=8,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=8192,
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find the latest run folder; skip any stray files in output_path
+    continue_path = max((p for p in output_path.iterdir() if p.is_dir()), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py
index c506fb48dc..b110f9bd61 100644
--- a/tests/vocoder_tests/test_hifigan_train.py
+++ b/tests/vocoder_tests/test_hifigan_train.py
@@ -1,43 +1,41 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import HifiganConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
 
-config = HifiganConfig(
-    batch_size=8,
-    eval_batch_size=8,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=1024,
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+    config = HifiganConfig(
+        batch_size=8,
+        eval_batch_size=8,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=1024,
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find the latest run folder; skip any stray files in output_path
+    continue_path = max((p for p in output_path.iterdir() if p.is_dir()), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py
index 6ef9cd495b..019c511da7 100644
--- a/tests/vocoder_tests/test_melgan_train.py
+++ b/tests/vocoder_tests/test_melgan_train.py
@@ -1,43 +1,42 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import MelganConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
-config = MelganConfig(
-    batch_size=4,
-    eval_batch_size=4,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=2048,
-    eval_split_size=1,
-    print_step=1,
-    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
+
+    config = MelganConfig(
+        batch_size=4,
+        eval_batch_size=4,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=2048,
+        eval_split_size=1,
+        print_step=1,
+        discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find the latest run folder; skip any stray files in output_path
+    continue_path = max((p for p in output_path.iterdir() if p.is_dir()), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py
index 8002760706..4f9de80dfc 100644
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@@ -1,44 +1,43 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import MultibandMelganConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
-config = MultibandMelganConfig(
-    batch_size=8,
-    eval_batch_size=8,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=8192,
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    steps_to_start_discriminator=1,
-    data_path="tests/data/ljspeech",
-    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
+
+    config = MultibandMelganConfig(
+        batch_size=8,
+        eval_batch_size=8,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=8192,
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        steps_to_start_discriminator=1,
+        data_path="tests/data/ljspeech",
+        discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find the latest run folder; skip any stray files in output_path
+    continue_path = max((p for p in output_path.iterdir() if p.is_dir()), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py
index a126befe2e..1df44a11de 100644
--- a/tests/vocoder_tests/test_parallel_wavegan_train.py
+++ b/tests/vocoder_tests/test_parallel_wavegan_train.py
@@ -1,42 +1,40 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import ParallelWaveganConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
-config = ParallelWaveganConfig(
-    batch_size=4,
-    eval_batch_size=4,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=2048,
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
+    config = ParallelWaveganConfig(
+        batch_size=4,
+        eval_batch_size=4,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=2048,
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find latest folder
+    continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py
index c39d70e94c..d540667ee8 100644
--- a/tests/vocoder_tests/test_vocoder_gan_datasets.py
+++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py
@@ -3,16 +3,12 @@
 import numpy as np
 from torch.utils.data import DataLoader
 
-from tests import get_tests_output_path, get_tests_path
+from tests import get_tests_path
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.configs import BaseGANVocoderConfig
 from TTS.vocoder.datasets.gan_dataset import GANDataset
 from TTS.vocoder.datasets.preprocess import load_wav_data
 
-file_path = os.path.dirname(os.path.realpath(__file__))
-OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
-os.makedirs(OUTPATH, exist_ok=True)
-
 C = BaseGANVocoderConfig()
 
 test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py
index 95501c2d39..c9432d7f4b 100644
--- a/tests/vocoder_tests/test_vocoder_losses.py
+++ b/tests/vocoder_tests/test_vocoder_losses.py
@@ -2,17 +2,12 @@
 
 import torch
 
-from tests import get_tests_input_path, get_tests_output_path, get_tests_path
+from tests import get_tests_input_path
 from TTS.config import BaseAudioConfig
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import stft
 from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT
 
-TESTS_PATH = get_tests_path()
-
-OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
-os.makedirs(OUT_PATH, exist_ok=True)
-
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 ap = AudioProcessor(**BaseAudioConfig().to_dict())
diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py
index afe8d1dc8f..9be492927d 100644
--- a/tests/vocoder_tests/test_vocoder_pqmf.py
+++ b/tests/vocoder_tests/test_vocoder_pqmf.py
@@ -4,14 +4,13 @@
 import torch
 from librosa.core import load
 
-from tests import get_tests_input_path, get_tests_output_path, get_tests_path
+from tests import get_tests_input_path
 from TTS.vocoder.layers.pqmf import PQMF
 
-TESTS_PATH = get_tests_path()
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 
-def test_pqmf():
+def test_pqmf(tmp_path):
     w, sr = load(WAV_FILE)
 
     layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
@@ -23,4 +22,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr)
+    sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr)
diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py
index 503b4e2483..c3ae1309dc 100644
--- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py
+++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py
@@ -1,29 +1,38 @@
 import os
-import shutil
 
 import numpy as np
+import pytest
 from torch.utils.data import DataLoader
 
-from tests import get_tests_output_path, get_tests_path
+from tests import get_tests_path
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.configs import WavernnConfig
 from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files
 from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
 
-file_path = os.path.dirname(os.path.realpath(__file__))
-OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
-os.makedirs(OUTPATH, exist_ok=True)
-
 C = WavernnConfig()
 
 test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
-test_mel_feat_path = os.path.join(test_data_path, "mel")
-test_quant_feat_path = os.path.join(test_data_path, "quant")
-ok_ljspeech = os.path.exists(test_data_path)
 
+params = [
+    [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0],
+    [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4],
+    [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0],
+    [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0],
+    [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0],
+    [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2],
+    [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0],
+]
+
+
+@pytest.mark.parametrize("params", params)
+def test_parametrized_wavernn_dataset(tmp_path, params):
+    """Run dataloader with given parameters and check conditions"""
+    print(params)
+    batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params
+    test_mel_feat_path = tmp_path / "mel"
+    test_quant_feat_path = tmp_path / "quant"
 
-def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers):
-    """run dataloader with given parameters and check conditions"""
     ap = AudioProcessor(**C.audio)
 
     C.batch_size = batch_size
@@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor
     C.seq_len = seq_len
     C.data_path = test_data_path
 
-    preprocess_wav_files(test_data_path, C, ap)
+    preprocess_wav_files(tmp_path, C, ap)
     _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5)
 
     dataset = WaveRNNDataset(
@@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor
     max_iter = 10
     count_iter = 0
 
-    try:
-        for data in loader:
-            x_input, mels, _ = data
-            expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2))
-            assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
-
-            assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1]
-            count_iter += 1
-            if count_iter == max_iter:
-                break
-    # except AssertionError:
-    #     shutil.rmtree(test_mel_feat_path)
-    #     shutil.rmtree(test_quant_feat_path)
-    finally:
-        shutil.rmtree(test_mel_feat_path)
-        shutil.rmtree(test_quant_feat_path)
-
+    for data in loader:
+        x_input, mels, _ = data
+        expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2))
+        assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
 
-def test_parametrized_wavernn_dataset():
-    """test dataloader with different parameters"""
-    params = [
-        [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0],
-        [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4],
-        [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0],
-        [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0],
-        [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0],
-        [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2],
-        [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0],
-    ]
-    for param in params:
-        print(param)
-        wavernn_dataset_case(*param)
+        assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1]
+        count_iter += 1
+        if count_iter == max_iter:
+            break
diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py
index 9b10759505..02a1fc1228 100644
--- a/tests/vocoder_tests/test_wavegrad_train.py
+++ b/tests/vocoder_tests/test_wavegrad_train.py
@@ -1,54 +1,50 @@
-import glob
 import os
-import shutil
-import unittest
 
-from tests import get_device_id, get_tests_output_path, run_cli
-from TTS.vocoder.configs import WavegradConfig
+import pytest
 
+from tests import get_device_id, run_cli
+from TTS.vocoder.configs import WavegradConfig
 
-class WavegradTrainingTest(unittest.TestCase):
-    # TODO: Reactivate after improving CI run times
-    # This test currently takes ~2h on CI (15min/step vs 8sec/step locally)
-    if os.getenv("GITHUB_ACTIONS") == "true":
-        __test__ = False
-
-    def test_train(self):  # pylint: disable=no-self-use
-        config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-        output_path = os.path.join(get_tests_output_path(), "train_outputs")
-
-        config = WavegradConfig(
-            batch_size=8,
-            eval_batch_size=8,
-            num_loader_workers=0,
-            num_eval_loader_workers=0,
-            run_eval=True,
-            test_delay_epochs=-1,
-            epochs=1,
-            seq_len=8192,
-            eval_split_size=1,
-            print_step=1,
-            print_eval=True,
-            data_path="tests/data/ljspeech",
-            output_path=output_path,
-            test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2},
-        )
-        config.audio.do_trim_silence = True
-        config.audio.trim_db = 60
-        config.save_json(config_path)
-
-        # train the model for one epoch
-        command_train = (
-            f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-        )
-        run_cli(command_train)
-
-        # Find latest folder
-        continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
-
-        # restore the model and continue training for one more epoch
-        command_train = (
-            f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-        )
-        run_cli(command_train)
-        shutil.rmtree(continue_path)
+GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+
+
+# TODO: Reactivate after improving CI run times
+@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)")
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
+
+    config = WavegradConfig(
+        batch_size=8,
+        eval_batch_size=8,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=8192,
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        output_path=output_path,
+        test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2},
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
+
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
+
+    # Find latest folder
+    continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime)
+
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py
index 337e24259f..85e91efa36 100644
--- a/tests/vocoder_tests/test_wavernn_train.py
+++ b/tests/vocoder_tests/test_wavernn_train.py
@@ -1,45 +1,43 @@
-import glob
-import os
-import shutil
-
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.vocoder.configs import WavernnConfig
 from TTS.vocoder.models.wavernn import WavernnArgs
 
-config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
+def test_train(tmp_path):
+    config_path = tmp_path / "test_vocoder_config.json"
+    output_path = tmp_path / "train_outputs"
 
-config = WavernnConfig(
-    model_args=WavernnArgs(),
-    batch_size=8,
-    eval_batch_size=8,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=256,  # for shorter test time
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    output_path=output_path,
-)
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
-config.save_json(config_path)
+    config = WavernnConfig(
+        model_args=WavernnArgs(),
+        batch_size=8,
+        eval_batch_size=8,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        run_eval=True,
+        test_delay_epochs=-1,
+        epochs=1,
+        seq_len=256,  # for shorter test time
+        eval_split_size=1,
+        print_step=1,
+        print_eval=True,
+        data_path="tests/data/ljspeech",
+        output_path=output_path,
+    )
+    config.audio.do_trim_silence = True
+    config.audio.trim_db = 60
+    config.save_json(config_path)
 
-# train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
-run_cli(command_train)
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
+    )
+    run_cli(command_train)
 
-# Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+    # Find latest folder
+    continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime)
 
-# restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
-)
-run_cli(command_train)
-shutil.rmtree(continue_path)
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)