From 5a40dd2df2b52108694b679ffe557d2e02318236 Mon Sep 17 00:00:00 2001 From: Nay San Date: Thu, 14 Apr 2022 15:46:00 -0700 Subject: [PATCH] Re-factor SLI scripts --- scripts/_run-pipeline.sh | 10 ---------- scripts/helpers/sli.py | 4 ++-- scripts/train_sli-by-sblr.py | 35 ++++++++--------------------------- 3 files changed, 10 insertions(+), 39 deletions(-) delete mode 100755 scripts/_run-pipeline.sh diff --git a/scripts/_run-pipeline.sh b/scripts/_run-pipeline.sh deleted file mode 100755 index 2efc551..0000000 --- a/scripts/_run-pipeline.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/bash - -# Usage: -# ./scripts/_run-pipeline.sh data/raw/MATHEWS_J33-002163A-S1.wav - -python scripts/run_vad-by-silero.py $1 --overwrite - -python scripts/run_sli-by-sblr.py models/zmu-eng_sli_k10.pkl $1 --overwrite --rm_vad_tier - -python scripts/run_asr-by-w2v2.py /projects/muruwari/data/checkpoints/no-lm_b10 $1 --cuda --overwrite diff --git a/scripts/helpers/sli.py b/scripts/helpers/sli.py index be11b19..a96bdd7 100644 --- a/scripts/helpers/sli.py +++ b/scripts/helpers/sli.py @@ -31,10 +31,10 @@ def get_sli_df(sli_train_dir): return sli_df -def get_sb_encoder(): +def get_sb_encoder(save_dir="tmp"): sb_encoder = EncoderClassifier.from_hparams( source="speechbrain/lang-id-voxlingua107-ecapa", - savedir="tmp/", + savedir=save_dir, run_opts={"device": "cuda:1" if torch.cuda.is_available() else "cpu" } ) diff --git a/scripts/train_sli-by-sblr.py b/scripts/train_sli-by-sblr.py index 63b9f4f..14ba718 100644 --- a/scripts/train_sli-by-sblr.py +++ b/scripts/train_sli-by-sblr.py @@ -1,15 +1,10 @@ +import pickle + from argparse import ArgumentParser from sklearn.linear_model import LogisticRegression from sklearn.utils import shuffle -from speechbrain.pretrained import EncoderClassifier -from tqdm import tqdm -import glob -import os -import pandas as pd -import pickle -import torch -import torchaudio +from helpers.sli import get_sli_df, get_sb_encoder, add_sbemb_cols, colsplit_feats_labels parser = ArgumentParser( prog='train_sli-by-sblr', @@ -23,31 +18,17 @@ args = parser.parse_args() -language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp/") - -def get_sb_emb(wav_path): - waveform, sample_rate = torchaudio.load(wav_path) - - if sample_rate != 16_000: - print("Resampling audio to 16 kHz ...") - samp_to_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16_000) - waveform = samp_to_16k(waveform) - - emb = language_id.encode_batch(waveform) - - return emb.reshape((1, 256)) - -wav_files = glob.glob(os.path.join(args.clips_dir, "*", "*.wav")) -langs = [ os.path.basename(os.path.dirname(f)) for f in wav_files ] +sli_df = get_sli_df(args.clips_dir) print("Extracting features...") -embds = pd.concat([ pd.DataFrame(get_sb_emb(f)) for f in tqdm(wav_files) ]) +sli_df = add_sbemb_cols(sli_df, sb_encoder=get_sb_encoder()) -langs, embds = shuffle(langs, embds, random_state=0) +feats, labels = colsplit_feats_labels(sli_df) +feats, labels = shuffle(feats, labels, random_state=0) print("Fitting classifier...") -clf = LogisticRegression(random_state=0, max_iter=args.logreg_maxiter).fit(embds, langs) +clf = LogisticRegression(random_state=0, max_iter=args.logreg_maxiter).fit(feats, labels) pickle.dump(clf, open(args.logreg_pkl, 'wb')) print(f"Saved classifier to {args.logreg_pkl}")