Merge pull request Idlak#4 from Idlak/dev_trainfix2
Reverted nnet training utils / scripts to previous version
dabraude authored Jun 15, 2018
2 parents 0dcf922 + 442dd3b commit 0210345
Showing 6 changed files with 72 additions and 15 deletions.
6 changes: 5 additions & 1 deletion egs/wsj/s5/steps/nnet/train_scheduler.sh
@@ -17,6 +17,7 @@ l2_penalty=0
train_tool="nnet-train-frmshuff"
train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777"
feature_transform=
output_feature_transform=

split_feats= # int -> number of splits 'feats.scp -> feats.${i}.scp', starting from feats.1.scp,
# (data are already shuffled and split to N parts),
@@ -84,6 +85,7 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
log=$dir/log/iter00.initial.log; hostname>$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
${feature_transform:+ --feature-transform=$feature_transform} \
${output_feature_transform:+ --output-feature-transform=$output_feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_cv" "$labels_cv" $mlp_best \
@@ -125,6 +127,7 @@ for iter in $(seq -w $max_iters); do
--learn-rate=$learn_rate --momentum=$momentum \
--l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
${feature_transform:+ --feature-transform=$feature_transform} \
${output_feature_transform:+ --output-feature-transform=$output_feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_tr_portion" "$labels_tr" $mlp_best $mlp_next \
@@ -137,6 +140,7 @@ for iter in $(seq -w $max_iters); do
log=$dir/log/iter${iter}.cv.log; hostname>$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
${feature_transform:+ --feature-transform=$feature_transform} \
${output_feature_transform:+ --output-feature-transform=$output_feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_cv" "$labels_cv" $mlp_next \
@@ -147,7 +151,7 @@ for iter in $(seq -w $max_iters); do

# accept or reject?
loss_prev=$loss
if [ 1 == $(awk "BEGIN{print($loss_new < $loss ? 1:0);}") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then
if [ 1 == $(awk "BEGIN{print($loss_new < $loss ? 1:0);}") -o $iter -le $keep_lr_iters ]; then
# accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number,
loss=$loss_new
mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)
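The new --output-feature-transform flag is threaded through every nnet-train-frmshuff call with the same ${var:+...} shell expansion already used for the other optional arguments, so the flag is emitted only when the variable is set and non-empty. A minimal bash sketch of the idiom (the path below is purely illustrative):

    ft=exp/dnn/final.feature_transform                 # illustrative value, not from the recipe
    echo train_tool ${ft:+--feature-transform=$ft}     # -> train_tool --feature-transform=exp/dnn/final.feature_transform
    ft=
    echo train_tool ${ft:+--feature-transform=$ft}     # -> train_tool   (the flag disappears entirely)

Note also that the accept/reject test above drops the "-o $iter -le $min_iters" clause, so an iteration at or below min_iters is no longer accepted automatically when its cross-validation loss got worse.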
54 changes: 51 additions & 3 deletions egs/wsj/s5/utils/nnet/make_lstm_proto.py
@@ -17,21 +17,51 @@

# Generated Nnet prototype, to be initialized by 'nnet-initialize'.

import sys
import sys, math

###
### Parse options
###
from optparse import OptionParser
usage="%prog [options] <feat-dim> <num-leaves> >nnet-proto-file"
parser = OptionParser(usage)
# Softmax related,
parser.add_option('--no-softmax', dest='with_softmax',
help='Do not put <SoftMax> in the prototype [default: %default]',
default=True, action='store_false');
parser.add_option('--block-softmax-dims', dest='block_softmax_dims',
help='Generate <BlockSoftmax> with dims D1:D2:D3 [default: %default]',
default="", type='string');
# Required,
parser.add_option('--cell-dim', dest='cell_dim', type='int', default=320,
help='Number of cells for one direction in LSTM [default: %default]');
parser.add_option('--proj-dim', dest='proj_dim', type='int', default=400,
help='Number of LSTM recurrent units [default: %default]');
parser.add_option('--num-layers', dest='num_layers', type='int', default=2,
help='Number of LSTM layers [default: %default]');
# Activation related,
parser.add_option('--activation-final', dest='activation_final',
help='If set, outputs an activation layer as final layer [default: %default]',
default=False, action='store_true');
parser.add_option('--activation-type', dest='activation_type',
help='Select type of activation function : (<Sigmoid>|<Tanh>|<ParametricRelu>) [default: %default]',
default='<Tanh>', type='string');
parser.add_option('--activation-opts', dest='activation_opts',
help='Additional options for prototype of activation function [default: %default]',
default='', type='string');
# Affine-transform related,
parser.add_option('--hid-bias-mean', dest='hid_bias_mean',
help='Set bias for hidden activations [default: %default]',
default=-2.0, type='float');
parser.add_option('--hid-bias-range', dest='hid_bias_range',
help='Set bias range for hidden activations (+/- 1/2 range around mean) [default: %default]',
default=4.0, type='float');
parser.add_option('--param-stddev-factor', dest='param_stddev_factor',
help='Factor to rescale Normal distribution for initializing weight matrices [default: %default]',
default=0.1, type='float');
parser.add_option('--no-glorot-scaled-stddev', dest='with_glorot',
help='Generate normalized weights according to X.Glorot paper, but mapping U->N with same variance (factor sqrt(x/(dim_in+dim_out)))',
action='store_false', default=True);
# Optional (default == 'None'),
parser.add_option('--lstm-param-range', dest='lstm_param_range', type='float',
help='Range of initial LSTM parameters [default: %default]');
@@ -54,6 +84,15 @@

(feat_dim, num_leaves) = map(int,args);


# Optionally scale
def Glorot(dim1, dim2):
if o.with_glorot:
# 35.0 = magic number, gives ~1.0 in inner layers for hid-dim 1024dim,
return 35.0 * math.sqrt(2.0/(dim1+dim2));
else:
return 1.0

# Original prototype from Jiayu,
#<NnetProto>
#<Transmit> <InputDim> 40 <OutputDim> 40
@@ -78,9 +117,18 @@
print "<LstmProjected> <InputDim> %d <OutputDim> %d <CellDim> %s" % (o.proj_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts

# Adding <Tanh> for more stability,
#print "%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, o.proj_dim, o.proj_dim, o.activation_opts) # Non-linearity
print "<Tanh> <InputDim> %d <OutputDim> %d" % (o.proj_dim, o.proj_dim)

# Softmax layer,
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0" % (o.proj_dim, num_leaves) + softmax_affine_opts
print "<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves)
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % (o.proj_dim, num_leaves, 0.0, 0.0, (o.param_stddev_factor * Glorot(o.proj_dim, num_leaves)), 1.0, 0.1)# + softmax_affine_opts

# Optionally append softmax
if o.with_softmax:
if o.block_softmax_dims == "":
print "<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves)
else:
print "<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" % (num_leaves, num_leaves, o.block_softmax_dims)

if o.activation_final:
print "%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, num_leaves, num_leaves, o.activation_opts)
6 changes: 6 additions & 0 deletions egs/wsj/s5/utils/nnet/make_nnet_proto.py
@@ -37,6 +37,9 @@
help='Generate <BlockSoftmax> with dims D1:D2:D3 [default: %default]',
default="", type='string');
# Activation related,
parser.add_option('--activation-final', dest='activation_final',
help='If set, outputs an activation layer as final layer [default: %default]',
default=None, type='string');
parser.add_option('--activation-type', dest='activation_type',
help='Select type of activation function : (<Sigmoid>|<Tanh>|<ParametricRelu>) [default: %default]',
default='<Sigmoid>', type='string');
@@ -231,6 +234,9 @@ def Glorot(dim1, dim2):
else:
print "<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" % (num_leaves, num_leaves, o.block_softmax_dims)

if o.activation_final:
print "%s <InputDim> %d <OutputDim> %d %s" % (o.activation_final, num_leaves, num_leaves, o.activation_opts)

# We are done!
sys.exit(0)
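In make_nnet_proto.py the new --activation-final option is a string holding the component tag itself (unlike the boolean flag in the LSTM generator above), and it names the extra layer printed after the softmax. A hedged sketch, assuming the same <feat-dim> <num-leaves> positional arguments as make_lstm_proto.py and placeholder dimensions:

    # illustrative dimensions only
    python utils/nnet/make_nnet_proto.py --activation-final '<Tanh>' 440 3000 > nnet.proto
    # per the added print statement, the prototype should now end with a line like:
    #   <Tanh> <InputDim> 3000 <OutputDim> 3000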

9 changes: 5 additions & 4 deletions idlak-egs/tts_tangle_arctic/s2/local/idlak_make_lang.py
@@ -427,10 +427,11 @@ def idlak_make_lang(textfile, datadir, langdir):
chars[c] = 1
# get phone set from transcription lexicon
for p in prons:
pp = p.split()
for phone in pp:
phones[phone] = 1
fplex.write(("%s %s\n" % (utf8w, p)).encode('utf-8'))
if len(p):
pp = p.split()
for phone in pp:
phones[phone] = 1
fplex.write(("%s %s\n" % (utf8w, p)).encode('utf-8'))
if handler.oov.has_key(w):
fpoov.write(("%s %s\n" % (utf8w, prons[0])).encode('utf-8'))
fplex.close()
@@ -182,7 +182,7 @@ if [ "$synth" = "cere" ]; then
elif [ "$synth" = "excitation" ]; then
echo "generating in $tmpdir"
x2x +af $mcep > $mcep.float
mlsacheck -l $fftlen -c 2 -r 0 -P 5 -m $order -a $alpha < $mcep.float > $mcep.float.stable
mlsacheck 2> /dev/null -l $fftlen -c 2 -r 0 -P 5 -m $order -a $alpha < $mcep.float > $mcep.float.stable
psize=`echo "$period * $srate / 1000" | bc`
# We have to drop the first few F0 frames to match SPTK behaviour
#cat $f0 | awk -v srate=$srate '(NR > 2){if ($1 > 0) print srate / $1; else print 0.0}' | x2x +af \
@@ -214,7 +214,7 @@ elif [ "$synth" = "WORLD" ]; then
echo $world/synth $fftlen $srate $f0.double $mcep.sp.double $bap.double $out_wav
$world/synth $fftlen $srate $f0.double $mcep.sp.double $bap.double $out_wav
else
x2x +af $mcep | mlsacheck -l $fftlen -c 2 -r 0 -P 5 -m $order -a $alpha > $mcep.float
x2x +af $mcep | mlsacheck 2> /dev/null -l $fftlen -c 2 -r 0 -P 5 -m $order -a $alpha > $mcep.float
psize=`echo "$period * $srate / 1000" | bc`
# We have to drop the first few F0 frames to match SPTK behaviour
cat $f0 | awk -v srate=$srate '(NR > 2){if ($1 > 0) print srate / $1; else print 0.0}' | x2x +af \
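The only change in this synthesis script is the 2> /dev/null added to both mlsacheck calls. A redirection may appear anywhere in a simple command, so placing it before the options is equivalent to placing it at the end: whatever mlsacheck writes to stderr (its stability diagnostics) is discarded, while the binary cepstra on stdout still flow down the pipe. A runnable bash stand-in (not SPTK) showing the effect:

    # only the first message survives; stderr is silenced before the pipe
    { echo "kept: stdout reaches the pipe"; echo "dropped: stderr" >&2; } 2>/dev/null | cat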
8 changes: 3 additions & 5 deletions idlak-egs/tts_tangle_arctic/s2/run.sh
@@ -514,11 +514,9 @@ echo "
*********************
** Congratulations **
*********************
TTS-DNN trained and sample synthesis done.
TTS-DNN trained.
Samples can be found in $dnndir/tst_forward/wav_mlpg/*.wav.
More synthesis can be performed using the utils/synthesis_test.sh utility,
Synthesis can be performed using the utils/synthesis_test.sh utility,
e.g.: echo 'Test 1 2 3' | utils/synthesis_test-48k.sh
"
echo "#### Step 6: packaging DNN voice ####"
@@ -527,6 +525,6 @@ local/make_dnn_voice_pitch.sh --spk $spk --srate $srate --mcep_order $order --bn

echo "Voice packaged successfully. Portable models have been stored in ${spk}_pmdl."
echo "Synthesis can be performed using:
echo \"This is a demo of D N N synthesis\" | local/synthesis_voice_pitch.sh ${spk}_pmdl <out_wav>"
echo \"This is a demo of D N N synthesis\" | local/synthesis_voice_pitch.sh ${spk}_pmdl <out_dir>"

