-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_wav.py
37 lines (27 loc) · 1018 Bytes
/
preprocess_wav.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
from src.tools.vad import preprocess_wav, write_wav
import os
from tqdm import tqdm
from src.model.module import Featurizer
import torch as t
MANIFESTROOT = 'data/manifest'
MANIFEST = [os.path.join(MANIFESTROOT, i) for i in os.listdir(MANIFESTROOT) if i.endswith('.csv')]
MANIFEST = list(filter(lambda x: 'libri' not in x, MANIFEST))
# def vad_and_write(wav_file):
# sig = preprocess_wav(wav_file, source_sr=16000)
# file = write_wav(wav_file, sig)
# return file
if __name__ == '__main__':
# for file in tqdm(MANIFEST, desc='file'):
# df = pd.read_csv(file)
# for wav in tqdm(df.wav_file, desc=f'{file}:wav'):
# vad_and_write(wav)
featurizer = Featurizer()
df = pd.DataFrame()
for file in MANIFEST:
ndf = pd.read_csv(file)
df = pd.concat([df, ndf])
samples = df.sample(50000)
wav_files = samples.wav_file
m, s = featurizer.cal_cmvn(wav_files)
t.save({'mean': m, 'std': s}, 'mean_std.pth')