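"""Dataset utilities for conditioned speech modelling.

FolderDataset loads WAV files together with their acoustic conditioners
(cepstral coefficients from .cc files, log-F0 from .lf0 files, and FV plus
a voiced/unvoiced flag from .gv files), aligns and quantizes the audio,
normalizes the conditioners, and caches everything as .npy files under
npy_datasets/ so later runs can skip the extraction step.
"""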
import os

import numpy as np
import torch
from torch.utils.data import Dataset
from librosa.core import load

import utils
from interpolate import interpolation
class FolderDataset(Dataset):
    def __init__(self, datasets_path, path, cond_path, overlap_len, q_levels, ulaw, seq_len, batch_size, cond_dim,
                 cond_len, norm_ind, static_spk, look_ahead, partition):
        super().__init__()

        # Define class variables from initialization parameters
        self.overlap_len = overlap_len
        self.q_levels = q_levels
        self.ulaw = ulaw
        if self.ulaw:
            self.quantize = utils.uquantize
        else:
            self.quantize = utils.linear_quantize
        self.seq_len = seq_len
        self.batch_size = batch_size

        # Define sets of data, conditioners and speaker IDs
        self.data = []
        self.global_spk = []
        self.audio = []
        self.cond_dim = cond_dim
        self.cond_len = cond_len
        self.cond = np.empty(shape=[0, self.cond_dim])

        # Define npy dataset file names (multiplying a string by a boolean flag
        # appends the suffix only when the flag is set)
        npy_name_data = 'npy_datasets/' + partition + '/data' + static_spk * '_static' + '.npy'
        npy_name_spk = 'npy_datasets/' + partition + '/speakers' + static_spk * '_static' + '.npy'
        npy_name_audio_id = 'npy_datasets/' + partition + '/audio_id' + static_spk * '_static' + '.npy'

        # Define npy file names with maximum and minimum values of de-normalized conditioners
        npy_name_min_max_cond = 'npy_datasets/min_max' + norm_ind * '_ind' + (not norm_ind) * '_joint' \
            + static_spk * '_static' + '.npy'
        npy_name_cond = 'npy_datasets/' + partition + '/conditioners' + norm_ind * '_ind' + (not norm_ind) * '_joint' \
            + static_spk * '_static' + '.npy'

        # Define npy file name with the array of unique speakers in the dataset
        npy_name_spk_id = 'npy_datasets/spk_id' + static_spk * '_static' + '.npy'

        # Create the dataset only if some of the npy files are missing
        files = [npy_name_data, npy_name_cond, npy_name_spk, npy_name_min_max_cond]
        create_dataset = len(files) != len([f for f in files if os.path.isfile(f)])

        nosync = True
        if create_dataset:
            print('Creating ' + partition + ' dataset', '-' * 60, '\n')
            print('Extracting wav from: ', path)
            print('Extracting conditioning from: ', cond_path)
            print('List of files is: wav_' + partition + static_spk * '_static' + '.list')

            # Get file names from the partition's list
            file_names = open(datasets_path + 'wav_' + partition + static_spk * '_static' + '.list', 'r').\
                read().splitlines()

            if not os.path.isfile(npy_name_spk_id):
                # Search for unique speakers in the list (first two characters of
                # each file name) and sort them
                spk_list = list()
                for file in file_names:
                    current_spk = file[0:2]
                    if current_spk not in spk_list:
                        spk_list.append(current_spk)
                spk_list.sort()
                spk = np.asarray(spk_list)
                np.save(npy_name_spk_id, spk)
            else:
                spk = np.load(npy_name_spk_id)
            # Load each of the files from the list. Note that the extension has to be added
            for counter, file in enumerate(file_names):
                # Load WAV
                print(file + '.wav')
                (d, _) = load(path + file + '.wav', sr=None, mono=True)
                num_samples = d.shape[0]

                # Load CC (cepstral coefficients) conditioner
                c = np.loadtxt(cond_path + file + '.cc')
                c = c.reshape(-1, c.shape[1])
                (num_ceps, _) = c.shape

                # Load LF0 (log-F0) conditioner, interpolating unvoiced frames
                f0file = np.loadtxt(cond_path + file + '.lf0')
                f0, _ = interpolation(f0file, -10000000000)
                f0 = f0.reshape(f0.shape[0], 1)

                # Load FV conditioner and the voiced/unvoiced flag
                fvfile = np.loadtxt(cond_path + file + '.gv')
                fv, uv = interpolation(fvfile, 1e3)
                num_fv = fv.shape[0]
                uv = uv.reshape(num_fv, 1)
                fv = fv.reshape(num_fv, 1)

                # Load speaker conditioner (index where the ID is located)
                speaker = np.where(spk == file[0:2])[0][0]
                speaker = np.repeat(speaker, num_fv)

                # Array of audio IDs to track the rearranging
                audio = np.repeat(counter, num_fv)

                if nosync:
                    # Align the audio to a whole number of 80-sample conditioner frames:
                    # pad with zeros when close to a full frame, otherwise trim the
                    # remainder and drop the matching conditioner frame
                    oversize = num_samples % 80
                    print('oversize', oversize)
                    if oversize >= 60:
                        zeros = 80 - oversize
                        d = np.append(d, np.zeros(zeros))
                    elif oversize != 0:
                        d = d[:-oversize]
                        c = c[:-1][:]
                        f0 = f0[:-1]
                        fv = fv[:-1]
                        uv = uv[:-1]
                else:
                    truncate = num_ceps * 80
                    d = d[:truncate]
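                # Worked example of the alignment above (assumed numbers): a file with
                # num_samples = 36137 gives oversize = 57 < 60, so the last 57 samples
                # and one conditioner frame are dropped; num_samples = 36150 gives
                # oversize = 70 >= 60, so 10 zeros complete the last 80-sample frame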
                if not ulaw:
                    # Without u-law, quantize once here; with u-law, __getitem__
                    # quantizes on the fly
                    d = self.quantize(torch.from_numpy(d), self.q_levels).numpy()

                # Concatenate all speech conditioners
                cond = np.concatenate((c, f0), axis=1)
                cond = np.concatenate((cond, fv), axis=1)
                cond = np.concatenate((cond, uv), axis=1)

                # Append current audio file, speech conditioners and speaker ID
                self.data = np.append(self.data, d)
                self.cond = np.concatenate((self.cond, cond), axis=0)
                self.global_spk = np.append(self.global_spk, speaker)
                self.audio = np.append(self.audio, audio)
            total_samples = self.data.shape[0]
            dim_cond = self.cond.shape[1]
            print('Total samples: ', total_samples)

            # Keep only a whole number of batched sequences of length seq_len + overlap_len
            lon_seq = self.seq_len + self.overlap_len
            self.num_samples = self.batch_size * (total_samples // (self.batch_size * lon_seq * self.cond_len))
            print('Number of samples (1 audio file): ', self.num_samples)
            self.total_samples = self.num_samples * (self.seq_len + self.overlap_len) * self.cond_len
            total_conditioning = self.total_samples // self.cond_len

            self.data = self.data[:self.total_samples].reshape(self.batch_size, -1)
            self.length = self.total_samples // self.seq_len
            self.cond = self.cond[:total_conditioning].reshape(self.batch_size, -1, dim_cond)
            self.global_spk = self.global_spk[:total_conditioning].reshape(self.batch_size, -1)
            self.audio = self.audio[:total_conditioning].reshape(self.batch_size, -1)
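            # Worked example of the arithmetic above (assumed numbers): with
            # batch_size = 2, seq_len = 800, overlap_len = 80 and cond_len = 80,
            # lon_seq = 880 and audio is kept in multiples of 2 * 880 * 80 = 140800
            # samples; total_conditioning is total_samples // 80, data reshapes to
            # (2, total_samples // 2) and cond to (2, total_conditioning // 2, dim_cond)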
            # Save maximum and minimum of the de-normalized conditioners of the train partition
            if partition == 'train' and not os.path.isfile(npy_name_min_max_cond):
                if norm_ind:
                    # One max/min pair per speaker
                    print('Computing maximum and minimum values for each speaker of the training dataset.')
                    num_spk = len(spk)
                    self.max_cond = np.empty(shape=(num_spk, cond_dim))
                    self.min_cond = np.empty(shape=(num_spk, cond_dim))
                    for i in range(num_spk):
                        print('Computing speaker', i, 'of', num_spk, 'with ID:', spk[i])
                        self.max_cond[i] = np.amax(self.cond[self.global_spk == i], axis=0)
                        self.min_cond[i] = np.amin(self.cond[self.global_spk == i], axis=0)
                else:
                    # A single max/min pair over all speakers
                    print('Computing maximum and minimum values over all speakers of the training dataset.')
                    self.max_cond = np.amax(np.amax(self.cond, axis=1), axis=0)
                    self.min_cond = np.amin(np.amin(self.cond, axis=1), axis=0)
                np.save(npy_name_min_max_cond, np.array([self.min_cond, self.max_cond]))
            else:
                # Load previously saved maximum and minimum of the de-normalized conditioners
                self.min_cond = np.load(npy_name_min_max_cond)[0]
                self.max_cond = np.load(npy_name_min_max_cond)[1]
            if norm_ind:
                # Normalize conditioners with the maximum and minimum of each
                # individual speaker of the training partition
                print('Normalizing conditioners for each speaker of the training dataset.')
                for i in range(len(spk)):
                    self.cond[self.global_spk == i] = (self.cond[self.global_spk == i] - self.min_cond[i]) / \
                        (self.max_cond[i] - self.min_cond[i])
            else:
                # Normalize conditioners with the joint maximum and minimum over all
                # speakers of the training partition
                print('Normalizing conditioners for all speakers of the training dataset.')
                self.cond = (self.cond - self.min_cond) / (self.max_cond - self.min_cond)

            # Save partition's dataset
            np.save(npy_name_data, self.data)
            np.save(npy_name_cond, self.cond)
            np.save(npy_name_spk, self.global_spk)
            np.save(npy_name_audio_id, self.audio)
            print('Dataset created for ' + partition + ' partition', '-' * 60, '\n')
        else:
            # Load previously created dataset
            self.data = np.load(npy_name_data)
            self.global_spk = np.load(npy_name_spk)

            if look_ahead:
                if os.path.isfile(npy_name_cond.replace('.npy', '_ahead.npy')):
                    self.cond = np.load(npy_name_cond.replace('.npy', '_ahead.npy'))
                else:
                    # Append each frame's next-frame conditioners as extra channels
                    # (the last frame repeats itself), doubling the conditioner dimension
                    self.cond = np.load(npy_name_cond)
                    delayed = np.copy(self.cond)
                    delayed[:, :-1, :] = delayed[:, 1:, :]
                    self.cond = np.concatenate((self.cond, delayed), axis=2)
                    np.save(npy_name_cond.replace('.npy', '_ahead.npy'), self.cond)
            else:
                self.cond = np.load(npy_name_cond)

            # Load maximum and minimum of de-normalized conditioners
            self.min_cond = np.load(npy_name_min_max_cond)[0]
            self.max_cond = np.load(npy_name_min_max_cond)[1]

            # Compute length for current partition
            self.length = np.prod(self.data.shape) // self.seq_len
            print('Data shape:', self.data.shape)
            print('Conditioners shape:', self.cond.shape)
            print('Global speaker shape:', self.global_spk.shape)
            print('Dataset loaded for ' + partition + ' partition', '-' * 60, '\n')
    def __getitem__(self, index):
        verbose = False

        # Compute which sample within the batch has to be returned given an index
        n_batch, sample_in_batch = divmod(index, self.batch_size)

        # Compute start and end for both input data and target sequences
        start_data = n_batch * self.seq_len
        start_target = start_data + self.overlap_len
        end_target = start_target + self.seq_len
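        # Worked example (assumed numbers): with batch_size = 2, seq_len = 800 and
        # overlap_len = 80, index 5 gives n_batch = 2 and sample_in_batch = 1, so the
        # input spans samples [1600:2479) of batch row 1 and the target [1680:2480)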
        if not self.ulaw:
            data = torch.from_numpy(self.data[sample_in_batch][start_data:end_target - 1]).long()
            target = torch.from_numpy(self.data[sample_in_batch][start_target:end_target]).long()
        else:
            data = self.quantize(torch.from_numpy(self.data[sample_in_batch][start_data:end_target - 1]),
                                 self.q_levels)
            target = self.quantize(torch.from_numpy(self.data[sample_in_batch][start_target:end_target]),
                                   self.q_levels)

        # Count acoustic parameter computations in a sequence (1 computation every 80 audio samples)
        cond_in_seq = self.seq_len // self.cond_len

        # Reset all hidden states at the first batch to avoid predicting from unrelated samples
        reset = n_batch == 0
        from_cond = n_batch * cond_in_seq + 1
        to_cond = from_cond + cond_in_seq
        if verbose:
            print('batch', n_batch)
            print('sample in batch', sample_in_batch)
            print('from cond', from_cond)
            print('to cond', to_cond)

        cond = torch.from_numpy(self.cond[sample_in_batch][from_cond:to_cond])

        # Get the speaker ID for each conditioner in the sequence and keep the most
        # frequent one; on transitions from one audio file to another the exact
        # choice doesn't matter
        global_spk = self.global_spk[sample_in_batch][from_cond:to_cond]
        global_spk = np.argmax(np.bincount(global_spk.astype(int)))
        spk = torch.from_numpy(np.array([global_spk]))

        if verbose:
            print('data size: ', data.size(), 'with sequence length: ', self.seq_len,
                  'and overlap: ', self.overlap_len)
            print('conditioner size: ', cond.size())
            print('speaker size: ', spk.size())

        return data, reset, target, cond, spk
    def __len__(self):
        return self.length
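
# ---------------------------------------------------------------------------
# Minimal usage sketch (a hedged illustration, not part of the original
# module): how FolderDataset might be wrapped in a PyTorch DataLoader. All
# paths and hyper-parameters below are assumed placeholder values; the real
# ones come from the training configuration of the surrounding project.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    dataset = FolderDataset(
        datasets_path='datasets/',    # assumed: folder holding the wav_*.list files
        path='datasets/wav/',         # assumed: folder with the .wav files
        cond_path='datasets/cond/',   # assumed: folder with the .cc/.lf0/.gv files
        overlap_len=80, q_levels=256, ulaw=True,
        seq_len=800, batch_size=2, cond_dim=43, cond_len=80,
        norm_ind=False, static_spk=False, look_ahead=False,
        partition='train',
    )

    # shuffle must stay False: __getitem__ maps consecutive indices to contiguous
    # sequences and emits the hidden-state reset flag when n_batch == 0
    loader = DataLoader(dataset, batch_size=dataset.batch_size, shuffle=False)
    for data, reset, target, cond, spk in loader:
        print(data.size(), target.size(), cond.size(), spk.size(), reset)
        break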