This repository has been archived by the owner on May 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathTaskfile.yml
411 lines (353 loc) · 16.2 KB
/
Taskfile.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# Copyright: University of Queensland, 2019
######################################### DRIVER TASKS ############################################
_run-elan:
  desc: 'Run through processing pipeline for Elan transcriptions'
  cmds:
    # Default extract stage: the input data are assumed to be cleaned and
    # filtered already. Whatever elan-to-json extracts flows all the way
    # through the pipeline to the lexicon.
    # Step 1: empty the output folder and recreate tmp plus the Kaldi sub-folders.
    - task clean-output-folder tmp-makedir make-kaldi-subfolders
    # Step 2: .eaf files -> tmp/dirty.json -> cleaned JSON.
    - task elan-to-json
    - task clean-json
    # Step 3: generate the Kaldi files/configs, then resample and move the audio.
    - task build
    - task process-audio
_rerun-elan:
  desc: 'Run through processing pipeline for Elan transcriptions, skip the audio steps for faster lexicon development'
  cmds:
    # Text-only variant of _run-elan: same extract/clean/build steps, assuming
    # the data are cleaned and filtered, but the audio is not moved.
    # The output-folder reset is left commented out, so the existing output
    # tree is reused:
    # - task clean-output-folder tmp-makedir make-kaldi-subfolders
    - task elan-to-json
    - task clean-json
    - task build
_run-elan-split:
  desc: "Segment audio and text by Elan annotation start/end times then run through processing pipeline with default settings"
  cmds:
    # Start from an empty output tree with tmp and the Kaldi sub-folders in place.
    - task clean-output-folder tmp-makedir make-kaldi-subfolders
    # split-eafs extracts from the configured target tier and skips annotations
    # matched on the silence ref tier; it writes tmp/dirty.json itself.
    - task split-eafs
    # clean-json reads tmp/dirty.json and writes the cleaned file through its
    # own --outfile argument. Redirecting stdout onto that same file would
    # truncate it and risk overwriting the JSON, so no redirect is used here.
    - task clean-json
    # The build helper is named `build`; no `_build` task is defined in this file.
    - task build
    - task process-audio
_run-trs:
  desc: "Run through processing pipeline for TRS transcriptions"
  cmds:
    # Start from an empty output tree with tmp and the Kaldi sub-folders in place.
    - task clean-output-folder tmp-makedir make-kaldi-subfolders
    # trs-to-json passes --output_json tmp/dirty.json to its script, and
    # clean-json passes --outfile for the cleaned data, so neither call is
    # redirected: a shell redirect onto the same file would truncate it and
    # could overwrite what the script wrote.
    - task trs-to-json
    - task clean-json
    # The build helper is named `build`; no `_build` task is defined in this file.
    - task build
    - task process-audio
_run-textgrid:
  desc: "Run through processing pipeline for TextGrid transcriptions"
  cmds:
    # Start from an empty output tree with tmp and the Kaldi sub-folders in place.
    - task clean-output-folder tmp-makedir make-kaldi-subfolders
    # textgrid-to-json is given no output path, so its stdout is captured as
    # the dirty JSON here.
    - task textgrid-to-json > {{ .KALDI_OUTPUT_PATH }}/tmp/dirty.json
    # clean-json writes the cleaned file through its own --outfile argument;
    # redirecting stdout onto that same file would risk clobbering it.
    - task clean-json
    # The build helper is named `build`; no `_build` task is defined in this file.
    - task build
    - task process-audio
_train-test:
  desc: "Run Kaldi train and test stages on default settings"
  cmds:
    # Use && so that ./run.sh is only started when the cd succeeded; with `;`
    # a missing kaldi folder would still try to run ./run.sh from the current
    # directory.
    - cd {{ .KALDI_OUTPUT_PATH }}/kaldi && ./run.sh
_show-lattice:
  desc: 'Create pdf of the lattice graph'
  dir: /kaldi-helpers/working_dir/input/output/kaldi
  cmds:
    # The first space-separated field of each line in the testing `text` file
    # is used as an utterance id; show_lattice.sh is called once per id.
    - |
      for uttid in $(cut -d " " -f1 ../tmp/json_splitted/testing/text); do
        ./utils/show_lattice.sh $uttid ./exp/tri1/decode/lat.1.gz ./data/lang/words.txt
      done
    # Collect the PDFs found in the kaldi folder under the input directory.
    - mkdir -p /kaldi-helpers/working_dir/input/lattices
    - cp *.pdf /kaldi-helpers/working_dir/input/lattices
_transcribe:
  desc: >-
    This doesn't build the files yet, manually create infer/wav.scp,
    infer/spk2utt, infer/utt2spk and put infer/audio.
  dir: /kaldi-helpers
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Resample the audio found under INFER_PATH, then generate the infer files.
    - python3.6 {{ .HELPERS_PATH }}/{{ .INPUT_SCRIPTS_PATH }}/resample_audio.py -c {{ .INFER_PATH }}
    - sh {{ .HELPERS_PATH }}/{{ .INFERENCE_SCRIPTS_PATH }}/generate-infer-files.sh
    # Replace any previous data/infer folder so cp -R does not nest into it.
    - rm -rf {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
    - cp -R working_dir/input/infer {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
    - cp working_dir/input/infer/audio.wav {{ .KALDI_OUTPUT_PATH }}/kaldi/audio.wav
    # Decode, then copy the results back next to the input.
    - task infer
    - task copy-infer-results
_transcribe-align:
  desc: "This builds context files, just add infer/audio. Only works on single audio."
  dir: /kaldi-helpers
  cmds:
    - sh {{ .HELPERS_PATH }}/{{ .INFERENCE_SCRIPTS_PATH }}/generate-infer-files.sh
    # Remove any earlier data/infer first, as _transcribe does: when the target
    # already exists, `cp -R src dst` copies into dst/infer instead of replacing
    # it, leaving stale files in place.
    - rm -rf {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
    - cp -R working_dir/input/infer {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
    # Decode with alignment, then copy the results back next to the input.
    - task infer-align
    - task copy-infer-align-results
_transcribe-long:
  # NOTE(review): the description is empty in the original; intent is not documented.
  desc: ''
  dir: /kaldi-helpers
  cmds:
    # Full Elan pipeline, then train/test, then copy the aligned inference
    # results back to the input folder.
    - task _run-elan
    - task _train-test
    - task copy-infer-align-results
_demo:
  desc: 'Prepare and run a test inference (so I don''t have to)'
  cmds:
    # Build and train on the example data, then decode.
    - task prep-for-demo-infer
    - task infer
    # Make sure the destination exists before results are copied into it.
    - mkdir -p working_dir/input/infer
    - task copy-infer-results
_demo-align:
  desc: 'Prepare and run a test inference with aligned (ELAN & Textgrid) output'
  cmds:
    # Build and train on the example data, then decode with alignment.
    - task prep-for-demo-infer
    - task infer-align
    # Make sure the destination exists before results are copied into it.
    - mkdir -p working_dir/input/infer
    - task copy-infer-align-results
######################################### HELPER TASKS ############################################
build:
  desc: 'Run through only the build stage with default settings (i.e. not the clean/filter/extract stage)'
  cmds:
    # Generate the corpus files and the config files into tmp ...
    - task generate-kaldi-files
    - task generate-kaldi-configs
    # ... then copy them, the phone configs and the helper scripts into the kaldi folder.
    - task copy-generated-files copy-phones-configs copy-helper-scripts
    # Marker line printed once every step above has finished.
    - 'echo "######################## Build task completed without errors"'
process-audio:
  desc: 'Resample and move the audio'
  cmds:
    - task resample-audio
    # Tar the wav files up from the corpus, then unpack them into the kaldi folder.
    - task gather-wavs extract-wavs
    # Marker line printed once every step above has finished.
    - 'echo "######################## Process and move audio completed"'
prep-for-demo-infer:
  desc: 'Do run, train steps, and build infer files'
  cmds:
    # Drop any earlier inference input, then load the example corpus.
    - rm -rf /kaldi-helpers/working_dir/input/infer
    - task ready-example
    # Run the Elan pipeline and the train/test stage.
    - task _run-elan
    - task _train-test
    # Reuse the test data as the inference data.
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
    - cp -R {{ .KALDI_OUTPUT_PATH }}/kaldi/data/test/* {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer
copy-infer-results:
  desc: 'Copy audio file and infer results back to input dir for easy access'
  cmds:
    - cp {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer/one-best-hypothesis.txt working_dir/input/infer/
    # Find the audio file: take the second field of the first line of the
    # test wav.scp and drop its first two characters (presumably a leading
    # "./" - confirm against the generated wav.scp), then copy that file.
    - |
      wav_rel_path=$(head -n 1 working_dir/input/output/kaldi/data/test/wav.scp | awk '{print $2}' | cut -c 3- ) &&
      cp "working_dir/input/output/kaldi/$wav_rel_path" working_dir/input/infer/
copy-infer-align-results:
  desc: 'Copy infer-align results back to input dir for easy access'
  cmds:
    - cp {{ .KALDI_OUTPUT_PATH }}/kaldi/data/infer/utterance-0.eaf working_dir/input/infer/
    # Find the audio file: take the second field of the first line of the
    # test wav.scp and drop its first two characters (presumably a leading
    # "./" - confirm against the generated wav.scp), then copy that file.
    - |
      wav_rel_path=$(head -n 1 working_dir/input/output/kaldi/data/test/wav.scp | awk '{print $2}' | cut -c 3- ) &&
      cp "working_dir/input/output/kaldi/$wav_rel_path" working_dir/input/infer/
generate-kaldi-files:
  desc: 'Generate corpus-related files for Kaldi from JSON data'
  cmds:
    - task json-to-kaldi
    - task make-wordlist
    - task make-prn-dict
    # make-nonsil-phones prints its result on stdout; capture it into tmp.
    - >-
      task make-nonsil-phones
      > {{ .KALDI_OUTPUT_PATH }}/tmp/nonsilence_phones.txt
generate-kaldi-configs:
  desc: 'Generate config files for Kaldi from KALDI_TEMPLATES, populate with Taskvars'
  cmds:
    # Each command renders one template with `mo`: the task variables are
    # placed in front of the command as environment assignments, the template
    # is fed on stdin and the result is written into tmp.
    # path.sh
    - >-
      KALDI_ROOT={{ .KALDI_ROOT }}
      HELPERS_PATH={{ .HELPERS_PATH }}
      CORPUS_PATH={{ .CORPUS_PATH }}
      mo < {{ .KALDI_TEMPLATES }}/path.sh > {{ .KALDI_OUTPUT_PATH }}/tmp/path.sh
    # mfcc.conf
    - >-
      MFCC_SAMPLE_FREQUENCY={{ .MFCC_SAMPLE_FREQUENCY }}
      MFCC_FRAME_LENGTH={{ .MFCC_FRAME_LENGTH }}
      MFCC_LOW_FREQ={{ .MFCC_LOW_FREQ }}
      MFCC_HIGH_FREQ={{ .MFCC_HIGH_FREQ }}
      MFCC_NUM_CEPS={{ .MFCC_NUM_CEPS }}
      mo < {{ .KALDI_TEMPLATES }}/mfcc.conf > {{ .KALDI_OUTPUT_PATH }}/tmp/mfcc.conf
    # decode.config
    - >-
      DECODE_BEAM={{ .DECODE_BEAM }}
      DECODE_FIRST_BEAM={{ .DECODE_FIRST_BEAM }}
      mo < {{ .KALDI_TEMPLATES }}/decode.config > {{ .KALDI_OUTPUT_PATH }}/tmp/decode.config
##################### Helpers for copying things
copy-generated-files:
  desc: 'Copy generated files to appropriate (sub)directories under /output_scripts/kaldi'
  cmds:
    # Corpus text and dictionary inputs.
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/training/corpus.txt {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/lexicon.txt {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/dict/
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/nonsilence_phones.txt {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/dict/
    # Rendered config files.
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/path.sh {{ .KALDI_OUTPUT_PATH }}/kaldi/
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/mfcc.conf {{ .KALDI_OUTPUT_PATH }}/kaldi/conf/
    - cp {{ .KALDI_OUTPUT_PATH }}/tmp/decode.config {{ .KALDI_OUTPUT_PATH }}/kaldi/conf/
    # Test and train sets are copied from separate testing/ and training/
    # folders (an older note said the default settings made them identical;
    # that note also says this is no longer the case).
    - >-
      cp
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/testing/segments
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/testing/text
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/testing/utt2spk
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/testing/wav.scp
      {{ .KALDI_OUTPUT_PATH }}/kaldi/data/test/
    - >-
      cp
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/training/segments
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/training/text
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/training/utt2spk
      {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted/training/wav.scp
      {{ .KALDI_OUTPUT_PATH }}/kaldi/data/train/
copy-helper-scripts:
  desc: 'Copy the necessary scripts from Kaldi'
  cmds:
    # Scripts taken from the template folder.
    - cp {{ .KALDI_TEMPLATES }}/cmd.sh {{ .KALDI_OUTPUT_PATH }}/kaldi/
    - cp {{ .KALDI_TEMPLATES }}/run.sh {{ .KALDI_OUTPUT_PATH }}/kaldi/
    - cp {{ .KALDI_TEMPLATES }}/score.sh {{ .KALDI_OUTPUT_PATH }}/kaldi/local/
    # steps/ and utils/ come from the wsj/s5 recipe under KALDI_ROOT; -L makes
    # cp follow symbolic links so real files are copied.
    - cp -L -r {{ .KALDI_ROOT }}/egs/wsj/s5/steps {{ .KALDI_OUTPUT_PATH }}/kaldi/steps
    - cp -L -r {{ .KALDI_ROOT }}/egs/wsj/s5/utils {{ .KALDI_OUTPUT_PATH }}/kaldi/utils
copy-phones-configs:
  desc: 'Copy provided silence/optional silence configuration files'
  cmds:
    # Both files are taken from the input config folder and placed in the
    # Kaldi dictionary folder.
    - cp ./working_dir/input/config/optional_silence.txt {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/dict/
    - cp ./working_dir/input/config/silence_phones.txt {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/dict/
gather-wavs:
  desc: "Gather all wav files inside input_scripts/data into output_scripts/media.tar"
  cmds:
    # Tar up the .wav files so the folder structure is kept, because Kaldi's
    # wav.scp data file uses the directory structure.
    # - `&&` stops the command if the cd into the corpus fails.
    # - find selects regular files whose name ends in .wav and hands them to
    #   tar NUL-separated, so paths containing spaces are archived correctly.
    # NOTE(review): --null / -T - are GNU tar options; confirm the image ships GNU tar.
    - >-
      cd {{ .CORPUS_PATH }} &&
      find . -type f -name '*.wav' -print0 |
      tar cf {{ .HELPERS_PATH }}/{{ .KALDI_OUTPUT_PATH }}/media.tar --null -T -
extract-wavs:
  desc: 'Extract all wav files into kaldi folder'
  cmds:
    # Unpack media.tar inside the kaldi folder (-C), then remove the archive.
    - tar xf {{ .KALDI_OUTPUT_PATH }}/media.tar -C {{ .KALDI_OUTPUT_PATH }}/kaldi
    - rm {{ .KALDI_OUTPUT_PATH }}/media.tar
##################### Helpers for folders stuff
clean-output-folder:
  desc: "Delete all files and folders inside output_scripts directory"
  cmds:
    # Safety guard: fail the task if the variable is an empty string, because
    # the next command would otherwise expand to `rm -rf /*`.
    - test -n "{{ .KALDI_OUTPUT_PATH }}"
    - rm -rf {{ .KALDI_OUTPUT_PATH }}/*
tmp-makedir:
  desc: 'Make the tmp directory, if it does not exist'
  cmds:
    # Create the directory only when it is not already there.
    - '[ -d {{ .KALDI_OUTPUT_PATH }}/tmp ] || mkdir -p {{ .KALDI_OUTPUT_PATH }}/tmp'
tmp-delete:
  deps: [tmp-makedir]
  desc: "Delete all files in tmp directory"
  cmds:
    # Empty only the tmp sub-directory, as the task name and description say.
    # The previous commands targeted the whole output directory, which would
    # also remove the kaldi folder. With -f, rm succeeds even when tmp is
    # already empty, so no placeholder file has to be created first.
    - rm -rf {{ .KALDI_OUTPUT_PATH }}/tmp/*
make-kaldi-subfolders:
  desc: 'Makes subfolder structure which Kaldi expects'
  cmds:
    # Data folders: dictionary, test set, train set.
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/data/local/dict
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/data/test
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/data/train
    # Folders for config files and local scripts.
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/conf
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/kaldi/local
##################### Helpers for generating things (mostly wrappers for Python scripts)
cat-all-json:
  desc: 'Concatenate all .json files into one .json file'
  cmds:
    # No input file is named, so jq reads stdin; -s slurps every JSON value
    # into one array and `add` combines the array elements.
    - "jq -s '. | add'"
make-nonsil-phones:
  desc: 'Generate non-silence phones file from LETTER_TO_SOUND_PATH file defined in Taskfile.yml'
  cmds:
    # Pipeline, printed to stdout: drop lines starting with '#', keep the
    # second space-separated field, drop empty lines, then sort and de-duplicate.
    - >-
      grep -v '^#' < {{ .LETTER_TO_SOUND_PATH }}
      | cut -d' ' -f2
      | grep -v '^$'
      | sort -u
elan-to-json:
  desc: 'Convert a folder of .eaf files to a single JSON file'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Reads the corpus folder for the configured target tier and writes
    # tmp/dirty.json; the folded scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/elan_to_json.py
      --input_dir {{ .CORPUS_PATH }}
      --output_dir {{ .KALDI_OUTPUT_PATH }}
      --tier {{ .TARGET_LANGUAGE_TIER }}
      --output_json {{ .KALDI_OUTPUT_PATH }}/tmp/dirty.json
trs-to-json:
  desc: 'Convert a folder of .trs files to a single JSON file'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Make sure tmp exists before the script is asked to write into it.
    - mkdir -p {{ .KALDI_OUTPUT_PATH }}/tmp
    # The folded scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/trs_to_json.py
      --input_dir {{ .CORPUS_PATH }}
      --output_json {{ .KALDI_OUTPUT_PATH }}/tmp/dirty.json
textgrid-to-json:
  desc: 'Convert a folder of .textgrid files to a single JSON file'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # No output path is passed here; the _run-textgrid driver redirects this
    # task's stdout into tmp/dirty.json.
    # NOTE(review): an earlier comment said praatio would not install because
    # of an ssl error and to "use 3.4 for now", yet the command runs python3.6
    # - confirm which interpreter is intended.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/textgrid_to_json.py
      --input_dir {{ .CORPUS_PATH }}
json-to-kaldi:
  desc: 'Generate files for the Kaldi format'
  cmds:
    # Input is the cleaned JSON in tmp; output goes to tmp/json_splitted and
    # tmp/corpus.txt. The folded scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/json_to_kaldi.py
      --input_json {{ .KALDI_OUTPUT_PATH }}/tmp/{{ .CLEANED_FILTERED_DATA }}
      --output_folder {{ .KALDI_OUTPUT_PATH }}/tmp/json_splitted
      --corpus_file {{ .KALDI_OUTPUT_PATH }}/tmp/corpus.txt
      --text_corpus {{ .INPUT_PATH }}/config/text_corpora/
clean-json:
  desc: 'Clean corpus of problematic characters before passing data to Kaldi'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Reads tmp/dirty.json and writes the cleaned data file in tmp; the folded
    # scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/clean_json.py
      --infile {{ .KALDI_OUTPUT_PATH }}/tmp/dirty.json
      --outfile {{ .KALDI_OUTPUT_PATH }}/tmp/{{ .CLEANED_FILTERED_DATA }}
make-wordlist:
  desc: 'Make a list of unique words that occur in the corpus'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Ensure the text_corpora config folder exists.
    - mkdir -p /kaldi-helpers/working_dir/input/config/text_corpora
    # Build tmp/wordlist.txt from the cleaned JSON, the additional word list
    # and tmp/corpus.txt; the folded scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/make_wordlist.py
      --infile {{ .KALDI_OUTPUT_PATH }}/tmp/{{ .CLEANED_FILTERED_DATA }}
      --outfile {{ .KALDI_OUTPUT_PATH }}/tmp/wordlist.txt
      --word_list {{ .INPUT_PATH }}/config/additional_words.txt
      --kaldi_corpus {{ .KALDI_OUTPUT_PATH }}/tmp/corpus.txt
make-prn-dict:
  desc: 'Make pronunciation dictionary'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Turns tmp/wordlist.txt into tmp/lexicon.txt using the letter-to-sound
    # file as config; the folded scalar below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/make_prn_dict.py
      --infile {{ .KALDI_OUTPUT_PATH }}/tmp/wordlist.txt
      --outfile {{ .KALDI_OUTPUT_PATH }}/tmp/lexicon.txt
      --config {{ .LETTER_TO_SOUND_PATH }}
resample-audio:
  desc: 'Change audio to 16 bit 44.1kHz mono WAV'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Runs the resampling script over the corpus folder; the folded scalar
    # below is one single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/resample_audio.py
      --corpus {{ .CORPUS_PATH }}
split-eafs:
  desc: >-
    Read Elan files, slices matching WAVs by start and end times of annotations
    on a particular tier, outputting separate clips and text. Skips annotations
    with value '*PUB' on the main tier, or annotations that have a ref
    annotation on the 'Silence' tier.
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Reads the dirty data folder; writes tmp/dirty.json, audio into the
    # corpus folder and text into tmp/labels. The tier, silence marker and
    # silence tier come from task variables. The folded scalar below is one
    # single command line.
    - >-
      python3.6 {{ .INPUT_SCRIPTS_PATH }}/split_eafs.py
      --input_dir {{ .DIRTY_DATA_PATH }}
      --tier {{ .TARGET_LANGUAGE_TIER }}
      --silence_marker {{ .SILENCE_MARKER }}
      --silence_tier {{ .SILENCE_REF_TIER }}
      --output_json {{ .KALDI_OUTPUT_PATH }}/tmp/dirty.json
      --output_audio_dir {{ .CORPUS_PATH }}
      --output_text_dir {{ .KALDI_OUTPUT_PATH }}/tmp/labels
##################### Helpers for preparing toy data for demo, and inferencing
ready-example:
  desc: 'Prepares the Abui toy corpus for use with Kaldi/kaldi-helpers'
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    # Create the input folder, then copy the bundled toy corpus into it.
    - mkdir -p /kaldi-helpers/working_dir/input
    - cp -R /kaldi-helpers/resources/corpora/abui_toy_corpus/* working_dir/input/
infer:
  desc: 'Run Kaldi inference_scripts on test data'
  # The decode script is started from inside the kaldi output folder.
  dir: /kaldi-helpers/working_dir/input/output/kaldi
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    - sh {{ .HELPERS_PATH }}/{{ .INFERENCE_SCRIPTS_PATH }}/gmm-decode.sh
infer-align:
  desc: 'Run Kaldi inference_scripts on test data and output_scripts ctm aligned files'
  # The decode-align script is started from inside the kaldi output folder.
  dir: /kaldi-helpers/working_dir/input/output/kaldi
  env:
    PYTHONIOENCODING: 'utf-8'
  cmds:
    - sh {{ .HELPERS_PATH }}/{{ .INFERENCE_SCRIPTS_PATH }}/gmm-decode-align.sh
split-inferences:
  desc: "split all audio "
  dir: /kaldi-helpers/working_dir/input/infer
  env:
    PYTHONIOENCODING: "utf-8"
  cmds:
    # NOTE(review): this task runs inside working_dir/input/infer; confirm
    # INFER_PATH resolves correctly from that directory.
    - mkdir -p {{ .INFER_PATH }}
    # Fixes relative to the previous command:
    # - the interpreter was named twice ("python3.6 python3.6"), which makes
    #   Python try to run a script called python3.6;
    # - a trailing backslash escaped the space produced by YAML line folding,
    #   gluing it onto --input_dir as part of the argument;
    # - the script path is prefixed with HELPERS_PATH, as in the other tasks
    #   that set `dir:`, since INPUT_SCRIPTS_PATH is used relative to it there.
    # The folded scalar below is one single command line; the script reads
    # from and writes to INFER_PATH.
    - >-
      python3.6 {{ .HELPERS_PATH }}/{{ .INPUT_SCRIPTS_PATH }}/split_on_silence.py
      --input_dir {{ .INFER_PATH }}
      --output_dir {{ .INFER_PATH }}
      --silence_length 180
      --threshold 20
      --added_silence 100