This repository has been archived by the owner on Dec 3, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdefaults.ini.sample
148 lines (116 loc) · 3.43 KB
/
defaults.ini.sample
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# =============================================================================
# Configuration file for IGT Detection.
#
# This file contains the settings used to configure the IGT detection
# script's environment.
# =============================================================================
# =============================================================================
# Section for various paths.
[paths]
# Directories, model path, and default file patterns.
# This is where the human-readable feature vectors will be output
# for the files to be classified.
feat_dir = ./output/feats
# This is where the labeled files output by the classifier will
# be placed.
classified_dir = ./output/classified
# Where to output the detected IGT instances.
detected_dir = ./output/detected-igt
# This is where the gold-standard freki files will be placed for
# evaluation.
gold_dir = ./gold
# This is where various debug outputs, such as the probability
# distributions for the classifier's decisions, will be placed.
debug_dir = ./output/debug
# The path to the classifier model to use by default.
classifier_path = ./data/igt-detector.model
# Default file paths, given as *.txt wildcard patterns.
# eval_files refers back to classified_dir from this section, so it
# follows any change made to classified_dir above.
# NOTE(review): this relies on the reader expanding percent-style
# references (as Python's configparser does by default) -- confirm
# against the code that loads this file.
eval_files = %(classified_dir)s/*.txt
train_files = ./train/*.txt
test_files = ./test/*.txt
[files]
# Data files: word lists plus a list of language names.
# NOTE(review): what each list is used for is inferred from the key and
# file names (English words, gloss words, meta words, grams) -- confirm
# against the feature-extraction code.
en_wordlist = ./data/wordlists/english_words.txt
gls_wordlist = ./data/wordlists/gloss.txt
met_wordlist = ./data/wordlists/meta.txt
lng_names = ./data/langs.txt
gram_list = ./data/wordlists/grams.txt
# Case-sensitive variant of the gram list; kept in the same
# ./data/wordlists directory as the other word lists.
gram_list_cased = ./data/wordlists/grams_case_sensitive.txt
[runtime]
# NOTE(review): presumably the amount of memory given to a Java process
# used by the script ("16g" looks like a JVM heap size) -- confirm.
java_mem = 16g
# NOTE(review): presumably 1 enables debugging and 0 disables it, and
# this governs the outputs written to debug_dir -- confirm.
debug_on = 1
# Uncomment and edit to point at a directory of additional modules.
#pythonpath = ./path/to/additional/modules
# =============================================================================
# LABEL Setup
#
# These settings control which labels are used.
# =============================================================================
[labels]
# Each option in this section is a switch: 1 means true, 0 means false.
#
# Use "B" and "I" labels in combination with the "L", "G", "T", and "M"
# labels to distinguish lines that occur in the middle of an IGT from
# lines that do not.
use_bi_labels = 1
# Some lines carry a combination of labels, such as "L-G-T" when all
# three occur on a single line. If this is set to true, such combined
# labels are allowed. If set to false, only the first of the multiple
# labels is used.
use_multi_labels = 0
# "Flags" are additional pieces of information about a line, such as
# +AC (for Author Citation) or +LN (for Language Name). They are
# stripped out by default, because keeping them would result in an
# explosion of labels.
strip_flags = 1
[featuresets]
# NOTE(review): the descriptions below are inferred from the key names;
# confirm them against the feature-extraction code.
# Presumably an upper bound on the number of features used.
max_features = 10000
# Presumably master switches (1 = on, 0 = off) for the two feature
# groups configured in [freki_features] and [text_features].
freki_feats_enabled = 1
text_feats_enabled = 1
# Presumably whether to also use features from neighbouring lines: the
# previous line, the line before that, and the next line.
use_prev_line = 1
use_prev_prev_line = 1
use_next_line = 1
[freki_features]
# Per-feature switches; every feature listed here is set to 1.
# NOTE(review): presumably 1 enables a feature and 0 disables it, and
# these only take effect when freki_feats_enabled is set in
# [featuresets] -- confirm against the feature-extraction code.
# Layout and position of the line.
is_indented = 1
is_first_page = 1
prev_line_same_block = 1
next_line_same_block = 1
# Font properties of the line.
has_nonstandard_font = 1
has_larger_font = 1
has_smaller_font = 1
# NOTE(review): presumably bucketed using the high_iscore, med_iscore
# and low_iscore values in [thresholds] -- confirm.
f_high_iscore = 1
f_med_iscore = 1
f_low_iscore = 1
[text_features]
# Per-feature switches; every feature listed here is set to 1.
# NOTE(review): presumably 1 enables a feature and 0 disables it, and
# these only take effect when text_feats_enabled is set in
# [featuresets] -- confirm against the feature-extraction code.
prev_tag = 1
words = 1
has_langname = 1
has_grams = 1
has_parenthetical = 1
has_citation = 1
has_asterisk = 1
has_bracketing = 1
has_underscore = 1
has_quotation = 1
has_numbering = 1
has_leading_whitespace = 1
# NOTE(review): presumably bucketed using the high_oov and med_oov
# values in [thresholds] -- confirm.
high_oov_rate = 1
med_oov_rate = 1
high_gls_oov = 1
med_gls_oov = 1
# NOTE(review): the following look like script / character-class
# detectors (e.g. jpn, grk, kor, cyr); the exact character ranges are
# not visible here -- confirm against the code.
has_jpn = 1
has_grk = 1
has_kor = 1
has_cyr = 1
has_acc_lat = 1
has_dia = 1
has_uni = 1
has_year = 1
[thresholds]
# Numeric cutoffs, written as fractions between 0 and 1.
# NOTE(review): presumably these define the "high"/"med"/"low" buckets
# behind the *_oov_rate features in [text_features] and the f_*_iscore
# features in [freki_features]; which features use the overlap values is
# not visible here -- confirm against the feature-extraction code.
high_oov = 0.5
med_oov = 0.2
med_overlap = 0.25
high_overlap = 0.5
high_iscore = 0.66
med_iscore = 0.5
low_iscore = 0.25
[nfold]
# Settings for n-fold runs.
# NOTE(review): the descriptions below are inferred from the key names;
# confirm them against the n-fold code.
# Presumably the fraction of the data used for training in each
# iteration (0.9 would leave 0.1 for testing).
nfold_ratio = 0.9
# Presumably the number of iterations to run.
nfold_iters = 10
# Presumably the random seed, fixed so that splits are reproducible.
nfold_seed = 232
# Presumably the directory where n-fold output is written.
nfold_dir = ./output/nfold