forked from meta-toolkit/meta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.toml
109 lines (93 loc) · 2.49 KB
/
config.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Resource locations.
prefix = "../data/"
libsvm-modules = "../deps/libsvm-modules/"
stop-words = "../data/lemur-stopwords.txt"
function-words = "../data/function-words.txt"

# Sentence-boundary resources.
punctuation = "../data/sentence-boundaries/sentence-punctuation.txt"
start-exceptions = "../data/sentence-boundaries/sentence-start-exceptions.txt"
end-exceptions = "../data/sentence-boundaries/sentence-end-exceptions.txt"

# Dataset and index selection.
dataset = "ceeaus"
# The corpus file is located inside the dataset folder.
corpus = "line.toml"
index = "ceeaus"

# **Estimated** RAM budget for indexing, in MB.
# Always set this lower than your physical RAM!
indexer-ram-budget = 1024

# The default value is the system thread concurrency; uncomment to override.
# indexer-num-threads = 8
# Analyzer list: a single "ngram-word" entry with ngram = 1.
[[analyzers]]
ngram = 1
method = "ngram-word"
filter = "default-unigram-chain"
# Query runner settings.
[query-runner]
# Create this file before running!
query-path = "../queries.txt"
# Uncomment the next line to run IR eval.
#query-judgements = "../data/ceeaus-qrels.txt"
query-id-start = 1  # default: 1
max-results = 10  # default: 10
trec-format = false  # default: false
# Ranker selection ("bm25") and its numeric parameters.
[ranker]
method = "bm25"
b = 0.75
k1 = 1.2
k3 = 500
# Classifier configuration; the nested "base" table selects the "sgd"
# method with "hinge" loss.
[classifier]
method = "one-vs-all"

[classifier.base]
loss = "hinge"
method = "sgd"
# Regressor configuration: "sgd" method with "least-squares" loss.
[regressor]
loss = "least-squares"
method = "sgd"
# LDA settings: 4 topics using "gibbs" inference.
[lda]
inference = "gibbs"
topics = 4
alpha = 1.0
beta = 1.0
max-iters = 1000
model-prefix = "lda-model"
# CRF settings.
[crf]
prefix = "crf"
corpus = "wsj"
# The treebank path is relative to the data prefix.
treebank = "penn-treebank"
section-size = 99
train-sections = [0, 18]
dev-sections = [19, 21]
test-sections = [22, 24]
# Language model files: an ARPA file and a prefix for binary files.
[language-model]
binary-file-prefix = "english-sentences-"
arpa-file = "../data/english-sentences.arpa"
# Diff settings.
[diff]
max-edits = 3
n-value = 3

# All penalties default to zero (no penalty).
# The base penalty is for any edit.
base-penalty = 0.0
insert-penalty = 0.0
remove-penalty = 0.0
substitute-penalty = 0.0
# Feature settings using the "info-gain" method.
[features]
prefix = "features"
method = "info-gain"
features-per-class = 20
# Sequence settings.
[sequence]
prefix = "perceptron-tagger"
corpus = "wsj"
# The treebank path is relative to the data prefix.
treebank = "penn-treebank"
section-size = 99
train-sections = [0, 18]
dev-sections = [19, 21]
test-sections = [22, 24]
# Parser settings.
[parser]
prefix = "parser"
corpus = "wsj"
# The treebank path is relative to the data prefix.
treebank = "penn-treebank"
section-size = 99
train-sections = [2, 21]
dev-sections = [22, 22]
test-sections = [23, 23]
# Word embedding settings, with nested "vocab" and "sgns" tables.
[embeddings]
prefix = "word-embeddings"
vector-size = 50
filter = [
    { type = "icu-tokenizer", suppress-tags = true },
    { type = "lowercase" }
]

[embeddings.vocab]
max-size = 500_000
min-count = 5

[embeddings.sgns]
iterations = 5
max-window-size = 5
negative-samples = 5