forked from r9y9/deepvoice3_pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjsut.py
64 lines (50 loc) · 2.23 KB
/
jsut.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii.datasets import jsut
from nnmnkwii.io import hts
from hparams import hparams
from os.path import exists
import librosa
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Preprocess every JSUT utterance into spectrogram files in parallel.

    Transcriptions and wav paths are collected from all available JSUT
    subsets, paired positionally, and each pair is handed to
    ``_process_utterance`` in a worker process. Utterances are numbered
    starting from 1.

    Args:
        in_dir: Corpus root directory handed to the nnmnkwii JSUT data sources.
        out_dir: Directory the ``.npy`` feature files are written to.
        num_workers: Maximum number of worker processes.
        tqdm: Optional wrapper applied to the list of futures (for example
            ``tqdm.tqdm``) to show progress; the default is the identity.

    Returns:
        A list with one ``(spectrogram_filename, mel_filename, n_frames,
        text)`` tuple per utterance, in submission order.

    Raises:
        Whatever a worker raised; it is re-raised by ``future.result()``.
    """
    # Gather the inputs before starting any worker process, so a failure
    # while scanning the corpus cannot leave a pool behind.
    transcriptions = jsut.TranscriptionDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()
    wav_paths = jsut.WavFileDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()

    # The context manager guarantees the pool is shut down (and its worker
    # processes reaped) even if one of the utterances fails.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(_process_utterance, out_dir, index, wav_path, text)
            for index, (text, wav_path) in enumerate(
                zip(transcriptions, wav_paths), start=1)
        ]
        return [future.result() for future in tqdm(futures)]
def _process_utterance(out_dir, index, wav_path, text):
    """Convert one utterance into linear and mel spectrograms saved on disk.

    The waveform is loaded, leading/trailing silence is removed (using the
    HTS label file when one exists next to the audio, otherwise an energy
    based trim), optionally peak-normalised, and then both spectrograms are
    written to ``out_dir`` as ``.npy`` files, transposed relative to what the
    ``audio`` helpers return.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        index: Utterance number used in the output file names.
        wav_path: Path of the source wav file.
        text: Transcription, passed through unchanged to the result.

    Returns:
        ``(spectrogram_filename, mel_filename, n_frames, text)`` where the
        file names are relative to ``out_dir`` and ``n_frames`` is the size
        of the second axis of the linear spectrogram.

    Raises:
        AssertionError: If the label file does not start with ``silB`` and
            end with ``silE``.
    """
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Labels live in a sibling "lab" directory next to "wav", under the same
    # base name. Only the immediate parent directory and the extension are
    # swapped, so other parts of the path that happen to contain "wav" are
    # left alone and the result does not depend on the path separator.
    wav_dir, wav_name = os.path.split(wav_path)
    parent_dir, leaf_dir = os.path.split(wav_dir)
    lab_dir = os.path.join(parent_dir, "lab") if leaf_dir == "wav" else wav_dir
    lab_path = os.path.join(lab_dir, os.path.splitext(wav_name)[0] + ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB", "first label must be silB: " + lab_path
        assert labels[-1][-1] == "silE", "last label must be silE: " + lab_path
        # Keep the samples between the end of the first label and the start
        # of the last one. The 1e-7 factor turns label times into seconds
        # (HTS label times are in 100 ns units -- see nnmnkwii docs).
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    if hparams.rescaling:
        # An empty or all-zero clip has no peak to normalise against;
        # dividing by it would raise or fill the features with NaN.
        peak = np.abs(wav).max() if wav.size else 0.0
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jsut-spec-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)