# defaults.ini.sample

# =============================================================================
# Configuration file for IGT Detection.
#
# This file contains the settings used to configure the environment of the
# IGT detection script.
# =============================================================================


# =============================================================================
# Section for various paths.

[paths]

# This is where the human-readable feature vectors will be output
# for the files to be classified.
feat_dir = ./output/feats

# This is where the labeled files output by the classifier will
# be placed.
classified_dir = ./output/classified

# Where to output the detected IGT instances.
detected_dir = ./output/detected-igt

# This is where the gold-standard freki files will be placed for
# evaluation.
gold_dir = ./gold

# This is where various debug outputs, such as the probability
# distributions for the classifier's decisions, will be placed.
debug_dir = ./output/debug

# The path to the classifier model to use by default.
classifier_path = ./data/igt-detector.model

# Default file globs for the eval, train, and test file sets.
# eval_files reuses the classified_dir value defined above through a
# configparser-style reference, so changing classified_dir also moves
# eval_files. NOTE(review): this relies on the file being read by a parser
# that performs this interpolation (e.g. Python configparser) -- confirm.
eval_files = %(classified_dir)s/*.txt
train_files = ./train/*.txt
test_files = ./test/*.txt


[files]

# Paths to the data files (word lists and language names).
# All word lists are kept together under ./data/wordlists/.
en_wordlist = ./data/wordlists/english_words.txt
gls_wordlist = ./data/wordlists/gloss.txt
met_wordlist = ./data/wordlists/meta.txt
lng_names = ./data/langs.txt
gram_list = ./data/wordlists/grams.txt
gram_list_cased = ./data/wordlists/grams_case_sensitive.txt

[runtime]

# Memory setting for Java. Presumably the amount of memory handed to a
# Java-based component ("16g" reads as 16 gigabytes) -- confirm against the
# code that consumes this value.
java_mem = 16g

# Debug switch; set to 1 in this sample (presumably 1 = on, 0 = off).
debug_on = 1

# Optional and disabled by default; remove the leading "#" to set it.
# Presumably a path to additional Python modules -- confirm.
#pythonpath = ./path/to/additional/modules

# =============================================================================
# LABEL Setup
#
# Here are the settings for which labels to use.
# =============================================================================

[labels]

# The switches in this section use 1 for true and 0 for false.

# Use "B" and "I" labels in combination with "L", "G", "T", and "M" labels
# to distinguish lines that occur in the middle of an IGT from those
# that do not.
use_bi_labels = 1

# Some lines appear as combinations of labels, such as "L-G-T" for all
# three on a single line. If this is set to true (1), these types of
# combined labels are allowed. If set to false (0), only the first
# of the multiple labels will be used.
use_multi_labels = 0

# "Flags" are additional pieces of information attached to a line's
# label, such as +AC (for Author Citation) or +LN (for Language Name).
# These are stripped out by default, as otherwise they would result in
# an explosion of labels.
strip_flags = 1

[featuresets]

# Upper limit on the number of features. NOTE(review): exactly how the limit
# is applied is not visible in this file -- confirm against the training code.
max_features = 10000

# Switches for whole groups of features; presumably these enable the
# features listed under [freki_features] and [text_features] -- confirm.
freki_feats_enabled = 1
text_feats_enabled = 1

# Switches for context lines; presumably whether features from the previous
# line, the line before that, and the next line are included -- confirm.
use_prev_line = 1
use_prev_prev_line = 1
use_next_line = 1

[freki_features]

# Individual feature switches; every feature listed here is set to 1 in this
# sample. Presumably these are layout/font properties taken from the freki
# files and 0 disables a feature -- confirm against the feature extractor.
is_indented = 1
is_first_page = 1
prev_line_same_block = 1
next_line_same_block = 1
has_nonstandard_font = 1
has_larger_font = 1
has_smaller_font = 1

# Presumably tied to the high_iscore / med_iscore / low_iscore cut-offs in
# the [thresholds] section -- confirm.
f_high_iscore = 1
f_med_iscore = 1
f_low_iscore = 1

[text_features]

# Individual feature switches; every feature listed here is set to 1 in this
# sample. Presumably these are computed from the text of each line and 0
# disables a feature -- confirm against the feature extractor.
prev_tag = 1
words = 1
has_langname = 1
has_grams = 1
has_parenthetical = 1
has_citation = 1
has_asterisk = 1
has_bracketing = 1
has_underscore = 1
has_quotation = 1
has_numbering = 1
has_leading_whitespace = 1

# Presumably tied to the high_oov / med_oov cut-offs in the [thresholds]
# section -- confirm.
high_oov_rate = 1
med_oov_rate = 1
high_gls_oov = 1
med_gls_oov = 1

# Presumably character-class checks (e.g. jpn = Japanese, grk = Greek,
# kor = Korean, cyr = Cyrillic, acc_lat = accented Latin, dia = diacritics,
# uni = Unicode) -- the abbreviations are not defined in this file; confirm.
has_jpn = 1
has_grk = 1
has_kor = 1
has_cyr = 1
has_acc_lat = 1
has_dia = 1
has_uni = 1
has_year = 1


[thresholds]

# Numeric cut-offs. Presumably these define what counts as "high" or "med"
# for the correspondingly named features in [text_features] (high_oov_rate,
# med_oov_rate, ...) -- confirm against the feature extractor.
high_oov = 0.5
med_oov = 0.2
med_overlap = 0.25
high_overlap = 0.5

# Presumably the cut-offs behind f_high_iscore, f_med_iscore, and
# f_low_iscore in [freki_features] -- confirm.
high_iscore = 0.66
med_iscore = 0.5
low_iscore = 0.25

[nfold]

# Settings for n-fold runs. NOTE(review): the meanings below are inferred
# from the key names -- confirm against the code that reads them.
# Presumably the fraction of the data used for training in each iteration.
nfold_ratio = 0.9
# Presumably the number of iterations to run.
nfold_iters = 10
# Presumably the random seed, fixed so that splits are reproducible.
nfold_seed = 232
# Output directory for the n-fold runs.
nfold_dir = ./output/nfold