From 6f34c74367e43a74ef7d745d5d9645573072fee1 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 12 Dec 2024 22:34:59 +0100 Subject: [PATCH] test: use temp output folders and pytest.parametrize --- tests/data_tests/test_loader.py | 407 +++++++++--------- tests/inference_tests/test_synthesize.py | 8 +- .../test_fullband_melgan_train.py | 73 ++-- tests/vocoder_tests/test_hifigan_train.py | 70 ++- tests/vocoder_tests/test_melgan_train.py | 73 ++-- .../test_multiband_melgan_train.py | 75 ++-- .../test_parallel_wavegan_train.py | 67 +-- .../test_vocoder_gan_datasets.py | 6 +- tests/vocoder_tests/test_vocoder_losses.py | 7 +- tests/vocoder_tests/test_vocoder_pqmf.py | 7 +- .../test_vocoder_wavernn_datasets.py | 72 ++-- tests/vocoder_tests/test_wavegrad_train.py | 96 ++--- tests/vocoder_tests/test_wavernn_train.py | 72 ++-- 13 files changed, 497 insertions(+), 536 deletions(-) diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..f260af161e 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,6 +44,9 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False @@ -54,203 +54,200 @@ print(" > Dynamic data loader test: {}".format(DATA_EXIST)) -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. - self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == batch_size + assert linear_input.shape[2] == ap.fft_size // 2 + 1 + assert mel_input.shape[2] == c.audio["num_mels"] + assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length + assert isinstance(speaker_name[0], str) + + # make sure that the computed mels and the waveform match and correctly computed + mel_new = ap.melspectrogram(wavs[0].squeeze().numpy()) + # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding + mel_new = mel_new[:, : mel_lengths[0]] + ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) + mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] + assert abs(mel_diff.sum()) < 1e-5 + + # check normalization ranges + if ap.symmetric_norm: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= -ap.max_norm + assert mel_input.min() < 0 + else: + assert mel_input.max() <= ap.max_norm + assert mel_input.min() >= 0 + + +def test_batch_group_shuffle(): + dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + assert avg_length >= last_length + assert is_items_reordered + + +def test_start_by_longest(): + """Test start_by_longest option. + + The first item of the fist batch must be longer than all the other items. + """ + dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + assert all(max_len >= mel_lengths) + + +def test_padding_and_spectrograms(tmp_path): + def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): + assert linear_input[idx, -1].sum() != 0 # check padding + assert linear_input[idx, -2].sum() != 0 + assert mel_input[idx, -1].sum() != 0 + assert mel_input[idx, -2].sum() != 0 + assert stop_target[idx, -1] == 1 + assert stop_target[idx, -2] == 0 + assert stop_target[idx].sum() == 1 + assert len(mel_lengths.shape) == 1 + assert mel_lengths[idx] == linear_input[idx].shape[0] + assert mel_lengths[idx] == mel_input[idx].shape[0] + + dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32) + mel = ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..c49ea5ab43 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,11 +1,9 @@ -import os +from tests import run_cli -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") + output_path = tmp_path / "output.wav" run_cli("tts --list_models") # single speaker model diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index 9d4e193382..972a47b2af 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -1,43 +1,42 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import FullbandMelganConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" + + config = FullbandMelganConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py index c506fb48dc..b110f9bd61 100644 --- a/tests/vocoder_tests/test_hifigan_train.py +++ b/tests/vocoder_tests/test_hifigan_train.py @@ -1,43 +1,41 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import HifiganConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = HifiganConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=1024, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 6ef9cd495b..019c511da7 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -1,43 +1,42 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import MelganConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" + + config = MelganConfig( + batch_size=4, + eval_batch_size=4, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=2048, + eval_split_size=1, + print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 8002760706..4f9de80dfc 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -1,44 +1,43 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import MultibandMelganConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" + + config = MultibandMelganConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + steps_to_start_discriminator=1, + data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py index a126befe2e..1df44a11de 100644 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ b/tests/vocoder_tests/test_parallel_wavegan_train.py @@ -1,42 +1,43 @@ import glob import os -import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import ParallelWaveganConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" + config = ParallelWaveganConfig( + batch_size=4, + eval_batch_size=4, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=2048, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index 9b10759505..02a1fc1228 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -1,54 +1,50 @@ -import glob import os -import shutil -import unittest -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig +import pytest +from tests import get_device_id, run_cli +from TTS.vocoder.configs import WavegradConfig -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" + + config = WavegradConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py index 337e24259f..85e91efa36 100644 --- a/tests/vocoder_tests/test_wavernn_train.py +++ b/tests/vocoder_tests/test_wavernn_train.py @@ -1,45 +1,43 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.models.wavernn import WavernnArgs -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_vocoder_config.json" + output_path = tmp_path / "train_outputs" -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = WavernnConfig( + model_args=WavernnArgs(), + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=256, # for shorter test time + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train)