# train.cfg
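# Assumed usage (standard OpenNMT-py entry point; adjust to this repo's actual scripts):
#   python train.py -config train.cfg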
# Model/Embeddings
word_vec_size: 600 # Word embedding size for src and tgt
share_embeddings: True # Share embeddings between src and tgt
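# Note (assumption, standard OpenNMT-py behaviour): sharing embeddings requires a shared
# src/tgt vocabulary, i.e. share_vocab at preprocessing time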
# Model/Embedding Features
feat_vec_size: 20 # Attribute embedding size
feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
feat_merge_activation: ReLU
# Model Structure
model_type: table # Type of source model to use [text|table|img|audio]
model_dtype: fp32
encoder_type: htransformer # Type of encoder [rnn|brnn|transformer|htransformer|cnn]
decoder_type: hrnn # Type of decoder [rnn|transformer|cnn|hrnn]
param_init: 0.1 # Uniform distribution with support (-param_init, +param_init)
# Generic size options are left at -1; the per-level sizes are set explicitly below
layers: -1
enc_layers: -1
heads: -1
glu_depth: -1
# Encoder sizes
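# (Assumption: units_* options configure the low-level encoder over records/units and
# chunks_* the high-level encoder over chunks in the hierarchical model)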
transformer_ff: 1024 # Size of hidden transformer feed-forward
units_layers: 2
chunks_layers: 2
units_head: 2
chunks_head: 2
units_glu_depth: 1
chunks_glu_depth: 1
# Decoder sizes
dec_layers: 2
rnn_size: 600
input_feed: 1
bridge: True
rnn_type: LSTM
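# input_feed: feed the attentional vector back as input to the next decoder step
# bridge: learn an extra transform between the encoder final state and the decoder initial state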
# Model/Attention
global_attention: general # Type of attn to use [dot|general|mlp|none]
global_attention_function: softmax # [softmax|sparsemax]
self_attn_type: scaled-dot # self attn type in transformer [scaled-dot|average]
generator_function: softmax
use_pos: True # whether to use attributes in the attention layers
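# 'general' is Luong-style bilinear attention; 'mlp' would be Bahdanau-style additive attention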
# Model/Copy
copy_attn: True
reuse_copy_attn: True # Reuse standard attention for copy
copy_attn_force: True # When available, train to copy
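# The copy/pointer mechanism lets the decoder copy source tokens (table values) directly into the output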
# Files and logs
data: experiments/exp-1/data/data # prefix of the data files produced by preprocess.py
save_model: experiments/exp-1/models/model # prefix for saved checkpoints
log_file: experiments/exp-1/train-log.txt
report_every: 50 # log current loss every X steps
save_checkpoint_steps: 500 # save a checkpoint every X steps
# GPU related:
gpu_ranks: [0] # IDs of the GPUs to use
world_size: 1 # total number of distributed processes
gpu_backend: nccl # type of torch distributed backend
gpu_verbose_level: 0
master_ip: localhost
master_port: 10000
seed: 123
# Optimization & training
batch_size: 32
batch_type: sents
normalization: sents
accum_count: [2] # Update weights every X batches
accum_steps: [0] # steps at which the accum_count value changes
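# With batch_size 32 and accum_count 2, each weight update accumulates gradients over 2 batches,
# i.e. an effective batch of ~64 sentences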
valid_steps: 500 # run models on validation set every X steps
train_steps: 30000
optim: adam
max_grad_norm: 5
dropout: .5
adam_beta1: 0.9
adam_beta2: 0.999
label_smoothing: 0.0
average_decay: 0
average_every: 1
# Learning rate
learning_rate: 0.001
learning_rate_decay: 0.5 # lr *= lr_decay
start_decay_step: 5000
decay_steps: 10000
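# Assuming OpenNMT-py's standard decay schedule: the lr stays at 0.001 until start_decay_step,
# then is multiplied by learning_rate_decay (halved) every decay_steps thereafter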