# qag.ini
[main]
quiet: False
ignoreWarnings: True
# model type: QA|QG|AE|E2E
type: QA
# text|chat
baseType: chat
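# model size in billions of parameters (used as "${main:modelSize}b" in paths)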
modelSize: 7
# training modes: test|norm
mode: norm
[paths]
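# the ${section:option} references below assume Python configparser's
# ExtendedInterpolation; with the [main] values above, base resolves to
# /models/llama-hf/7b-chat and output to /models/output/norm/7b-chatQA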
base: /models/llama-hf/7b-${main:baseType}
output: /models/output/${main:mode}/${main:modelSize}b-${main:baseType}${main:type}
data: /data/pbe/${main:type}
log: /data/logs
# dataProcessor paths (point to files, not directories)
dpSource: /data/pbe/clean/contextQuestions.csv
dpDest: /data/pbe/QA/data.jsonl
[data]
# manual|generate
sampleMode: generate
evalToTrainRatio: 0.07
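# e.g. 0.07 holds out roughly 7 eval examples for every 100 training examples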
qualityThreshold: 7
# tunable hyperparameters
[hyperparameters]
# General hyperparameters
learningRate: 1e-4
weightDecay: 1e-4
# more than one epoch degrades performance due to overfitting
epochs: 1
# LORA
# scaling factor relative to r: the LoRA update is scaled by alpha/r, so
# alpha = r weights your data about as strongly as the base weights; higher
# multiples weight it more heavily. lower values still need testing
loraAlpha: 256
# r = rank of the low-rank adapter matrices trained alongside the frozen
# weights; it determines how many parameters get fine-tuned
r: 64
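# with loraAlpha = 256 and r = 64, the adapter update is scaled by 256/64 = 4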
loraDropout: 0.01
# which projection layers to adapt: query, value, key, output (o_proj); use first letters
loraLayers: qvko
# none|all|lora_only
bias: none
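# these presumably feed a peft LoraConfig, roughly as below (a sketch; the
# exact mapping is assumed, not confirmed by this file):
#   LoraConfig(r=64, lora_alpha=256, lora_dropout=0.01, bias="none",
#              target_modules=["q_proj", "v_proj", "k_proj", "o_proj"])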
# general settings for training
[train]
# controls how often training pauses to evaluate, save, and log the model
# no|steps|epoch
saveStrategy: epoch
evalStrategy: steps
stepSize: 50
testSteps: 2
# number of most recent checkpoints to save
saveTotalLimit: 1
# whether to reload the best checkpoint at the end of training
loadBestModelAtEnd: False
# max sequence length (in tokens) during training
maxSeqLength: 512
# batch size per GPU
perDeviceTrainBatchSize: 2
# number of batches to accumulate gradients over before each optimizer step;
# raises the effective batch size without using more GPU memory
gradientAccumulationSteps: 10
# how many prediction steps to accumulate the output tensors for on the GPU
# before moving the results to the CPU. if None, all predictions are
# accumulated on GPU. higher numbers are faster but can throw 'Out of Memory' errors
evalAccumulationSteps: 10
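# e.g. perDeviceTrainBatchSize = 2 with gradientAccumulationSteps = 10 gives
# an effective batch size of 2 * 10 = 20 per GPU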
# compute the loss only on the model completion (answers/questions), not the prompt
# see https://github.com/huggingface/trl/issues/426 && .../trl/pull/445
optimizeCompletion: True
# packs several examples into one training input; SFTTrainer uses an EOS
# token to separate examples. forced to False when optimizeCompletion = True
packing: False
addCustomTokens: False
[eval]
# bleu|meteor|rouge
evalMetric: bleu
[generate]
# max number of new tokens to generate
maxLength: 256
# penalizes token repetition. 1.0 for no penalty, 1.2 recommended
repetitionPenalty: 1.0
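
# minimal loading sketch (an assumption: the ${section:option} syntax implies
# this file is read with Python configparser's ExtendedInterpolation):
#   from configparser import ConfigParser, ExtendedInterpolation
#   cfg = ConfigParser(interpolation=ExtendedInterpolation())
#   cfg.read('qag.ini')
#   cfg['paths']['output']                           # '/models/output/norm/7b-chatQA'
#   cfg['hyperparameters'].getfloat('learningRate')  # 0.0001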