-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.ini
executable file
·71 lines (69 loc) · 2.09 KB
/
config.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Locations of the resource files, directories and URLs used by the tool.
# Values use %(lgid-dir)s interpolation (Python configparser syntax), so every
# local path below is built from the lgid-dir key defined in this section.
[locations]
# base directory substituted into the paths below
# NOTE(review): "." is presumably resolved against the process working
# directory rather than this file's location -- confirm in the loading code
lgid-dir = .
# plain-text resources under res/
language-table = %(lgid-dir)s/res/lang_table.txt
most-common-codes = %(lgid-dir)s/res/common_codes.txt
english-word-names = %(lgid-dir)s/res/english_word_language_names.txt
word-index = %(lgid-dir)s/res/word_index.txt
language-index = %(lgid-dir)s/res/language_index.txt
word-language-mapping = %(lgid-dir)s/res/word_language_mapping.txt
# ODIN resources (no file extension; presumably directories -- TODO confirm)
odin-source = %(lgid-dir)s/res/odin-by-lang
odin-language-model = %(lgid-dir)s/res/odin-lm
# Crubadan resources: local CSV indexes, a local model path and a remote base URI
crubadan-index = %(lgid-dir)s/res/Crubadan.csv
crubadan-base-uri = http://crubadan.org/files/
crubadan-language-model = %(lgid-dir)s/res/crubadan_lm
crubadan-directory-index = %(lgid-dir)s/res/crubadan_directory_index.csv
# error file path, placed directly under lgid-dir rather than under res/
classify-error-file = %(lgid-dir)s/errors.txt
# Numeric and string settings; how each value is typed (int, bool, regex) is
# decided by the consuming code, not by this file.
[parameters]
# window sizes (number of lines)
window-size = 25
after-window-size = 25
close-window-size = 2
after-close-window-size = 2
# word/morpheme/character n-gram settings
word-n-gram-size = 1
morpheme-n-gram-size = 2
character-n-gram-size = 3
# NOTE(review): presumably the character/word n-gram sizes used with the
# Crubadan language models -- confirm against the consumer
crubadan-char-size = 3
crubadan-word-size = 1
# looks like a regular expression: one or more whitespace, "-", "=" or "+"
# characters; backslashes are stored literally (no escape processing)
morpheme-delimiter = [\s\-\=\+]+
# if a feature is used N times in a window/document, it is considered frequent
frequent-mention-threshold = 6
after-frequent-mention-threshold = 6
article-frequent-mention-threshold = 20
# normalize capitalization (lower/upper/title) of lang names to guide search
mention-capitalization = title
# length of language name to be considered suspiciously short
# NOTE(review): unit is presumably characters, and whether the bound is
# inclusive is not stated here -- confirm
short-name-size = 3
# return all mentions in a given span or only the single longest one
# (yes/no; per the wording above, "no" presumably means all mentions)
single-longest-mention = no
# use ODIN language models for code+language or just the code
# (yes/no; per the key name, "no" presumably means code+language models)
code-only-odin-lms = no
# Feature switches: one yes/no value per named feature. Every feature listed
# here is currently set to yes.
# NOTE(review): the prefixes (GL-, W-, L-, G-, T-, M-) appear to group features
# by scope, but their meaning is not defined in this file -- see the consumer.
[features]
# GL-* features
GL-first-lines = yes
GL-last-lines = yes
GL-frequent = yes
GL-most-frequent = yes
GL-most-frequent-code = yes
GL-possible-english-word = yes
GL-short-lang-name = yes
GL-is-english = yes
GL-multi-word-name = yes
# W-* features; the close/after/frequent names parallel the window-size and
# frequent-mention-threshold keys in [parameters] (presumably related -- confirm)
W-prev = yes
W-close = yes
W-closest = yes
W-frequent = yes
W-after = yes
W-close-after = yes
W-closest-after = yes
W-frequent-after = yes
# *-in-line features, one per L/G/T/M prefix
L-in-line = yes
G-in-line = yes
T-in-line = yes
M-in-line = yes
# L-LM* features; the w/m/c suffixes presumably correspond to the
# word/morpheme/character n-gram settings in [parameters] -- TODO confirm
L-LMw = yes
L-LMm = yes
L-LMc = yes
# L-CR-LM* features (presumably the Crubadan counterparts of the above)
L-CR-LMw = yes
L-CR-LMc = yes
# remaining features
G-overlap = yes
W-prevclass = yes