-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_vectors.sh
executable file
·137 lines (126 loc) · 5.54 KB
/
extract_vectors.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
# Polyglot launcher: bash starts this file and re-executes it under tclsh8.6.
# Each of the two comments below ends in a backslash. Tcl continues a comment
# across a backslash-newline, so it skips the shell command on the following
# line, whereas the shell ends the comment at the newline and runs the command.
#
# For the shell only: default ELLOGON_HOME to /opt/Ellogon when it is unset or
# empty, and export the result as ELLOGON. Hidden from Tcl by the backslash \
export ELLOGON=${ELLOGON_HOME:=/opt/Ellogon}
# For the shell only: replace the shell with tclsh8.6 running this same file,
# forwarding all command-line arguments. Hidden from Tcl by the backslash \
exec tclsh8.6 "$0" ${1+"$@"}
# Work from the directory that contains this script, so that the relative
# paths used throughout the file (./modules, ./data, ./representations)
# resolve against it no matter where the script was started from.
set script_dir [file dirname [file normalize [info script]]]
cd $script_dir
puts "Current working directory: [pwd]"

# Make Ellogon's Tcl modules visible to "package require".
::tcl::tm::path add $::env(ELLOGON)/ellogon2.0/tm

# Ellogon itself comes first, followed by the packages this script uses.
foreach pkg {
    ellogon
    ELEP::Macros::ComponentRunner
    ELEP::MachineLearning::Applications::NERCWord2Vec
} {
    package require $pkg
}

# Finally, load the project's own Ellogon modules from ./modules.
PUL_LoadPluginsInDir ./modules
# Data preparation: open the ArgumentMin1 collection, invoke the component
# runner on it with "TransformAnnotatedData Gazetteer", then save and close
# the collection. Change the condition to 0 to skip this step.
if {1} {
    set corpus [tip_OpenCollectionFixName ./data/ArgumentMin1]

    set pipeline [ELEP::Macros::ComponentRunner new]
    $pipeline run $corpus TransformAnnotatedData Gazetteer
    $pipeline destroy

    # Save the collection before releasing it.
    tip_Sync $corpus
    tip_Close $corpus
}
# apply_trainer --
#
#   Feed every document of a collection to a trainer object and report how
#   many instances it produced.
#
# Arguments:
#   trainer     Command name of the trainer object. It is reset before and
#               after the run, and -save_instances_all_O is set to 1.
#   file        Path handed to the trainer as the training file. A second
#               path, "<file without extension>.neg.txt", is derived from it
#               and handed over as well (reported below as "Negatives").
#   collection  Optional. Path of the collection to read; defaults to
#               ./data/ArgumentMin1, the collection this proc always used.
#
# Results:
#   None. Progress is written to stdout. Any error raised while adding a
#   document propagates to the caller, but only after the current document
#   and the collection have been closed.
proc apply_trainer {trainer file {collection ./data/ArgumentMin1}} {
    $trainer reset
    $trainer configure -save_instances_all_O 1
    set neg [file rootname $file].neg.txt
    $trainer init_training_file $file $neg

    set col [tip_OpenCollectionFixName $collection]
    try {
        for {set doc [tip_FirstDocument $col]} {$doc ne ""} \
            {set doc [tip_NextDocument $col]} {
            # Close each document even when the trainer rejects it, so a
            # failure does not leave the document open.
            try {
                $trainer train_add_document $doc
            } finally {
                tip_Close $doc
            }
        }
    } finally {
        # The collection is released on both the success and error paths.
        tip_Close $col
    }

    puts "  Generated [$trainer cget -instances] instances..."
    puts "  Generated [$trainer cget -instances_O] instances (negative)..."
    puts "  Instances saved in: $file"
    puts "  Negatives saved in: $neg"
    $trainer reset
};# apply_trainer
# Baseline: CRF training data built from words and POS tags only.
# Currently disabled. Change the condition to 1 to run it.
if {0} {
    puts "Generating base classifier training data..."
    set trainer [ELEP::MachineLearning::Applications::NERC::Serialise new \
        -ne_type segment \
        -ne_constraints {ann::type in {claim support argument}} \
        -templates_U {w pos} \
        -templates_B {} \
        -context_before 0 \
        -context_after 0 \
    ]

    # claim/support/argument segments: no context first, then 2 and 5.
    apply_trainer $trainer ./data/crf/words-pos-context-0.txt
    foreach width {2 5} {
        $trainer configure -context_before -$width -context_after $width
        apply_trainer $trainer ./data/crf/words-pos-context-${width}.txt
    }

    # Context 2 once more, this time with -skip_instances_all_O set to 0.
    $trainer configure -context_before -2 -context_after 2 -skip_instances_all_O 0
    apply_trainer $trainer ./data/crf/neg-words-pos-context-2.txt

    ## Only support...
    $trainer configure -ne_constraints {ann::type in {support}} \
        -context_before 0 -context_after 0 -skip_instances_all_O 1
    apply_trainer $trainer ./data/crf/only-support-words-pos-context-0.txt
    foreach width {2 5} {
        $trainer configure -context_before -$width -context_after $width
        apply_trainer $trainer ./data/crf/only-support-words-pos-context-${width}.txt
    }

    $trainer destroy
}
# Word2vec variant: same word/POS templates, serialised by the NERCWord2Vec
# trainer after loading the vectors from ./representations/model.bin.
# Currently disabled. Change the condition to 1 to run it.
if {0} {
    puts "Generating word2vec classifier training data..."
    set trainer [ELEP::MachineLearning::Applications::NERCWord2Vec::Serialise new \
        -ne_type segment \
        -ne_constraints {ann::type in {claim support argument}} \
        -templates_U {w pos} \
        -templates_B {} \
        -context_before 0 \
        -context_after 0 \
    ]
    $trainer load_vectors ./representations/model.bin

    # claim/support/argument segments: no context first, then 2 and 5.
    apply_trainer $trainer ./data/crf/words-pos-w2v-context-0.txt
    foreach width {2 5} {
        $trainer configure -context_before -$width -context_after $width
        apply_trainer $trainer ./data/crf/words-pos-w2v-context-${width}.txt
    }

    ## Only support...
    $trainer configure -ne_constraints {ann::type in {support}} \
        -context_before 0 -context_after 0
    apply_trainer $trainer ./data/crf/only-support-words-pos-w2v-context-0.txt
    foreach width {2 5} {
        $trainer configure -context_before -$width -context_after $width
        apply_trainer $trainer ./data/crf/only-support-words-pos-w2v-context-${width}.txt
    }

    $trainer destroy
}
# Word2vec variant with lookup: unigram templates are {w pos chk}, with
# -skip_instances_duplicate set to 1 and -generate_disjunctive_features set
# to 0. Only the context-0 run is active; the others are kept as comments.
if {1} {
    puts "Generating word2vec classifier training data (+lookup)..."
    set trainer [ELEP::MachineLearning::Applications::NERCWord2Vec::Serialise new \
        -ne_type segment \
        -ne_constraints {ann::type in {claim support argument}} \
        -templates_U {w pos chk} \
        -templates_B {} \
        -context_before 0 \
        -context_after 0 \
        -skip_instances_duplicate 1 \
        -generate_disjunctive_features 0 \
    ]
    $trainer load_vectors ./representations/model.bin

    apply_trainer $trainer ./data/crf/words-pos-lookup-w2v-context-0.txt

    # Disabled runs with wider context:
    # $trainer configure -context_before -2 -context_after 2
    # apply_trainer $trainer ./data/crf/words-pos-lookup-w2v-context-2.txt
    # $trainer configure -context_before -5 -context_after 5
    # apply_trainer $trainer ./data/crf/words-pos-lookup-w2v-context-5.txt

    ## Only support... (disabled)
    # $trainer configure -ne_constraints {ann::type in {support}} -context_before 0 -context_after 0
    # # apply_trainer $trainer ./data/crf/only-support-words-pos-lookup-w2v-context-0.txt
    # $trainer configure -context_before -2 -context_after 2
    # apply_trainer $trainer ./data/crf/only-support-words-pos-lookup-w2v-context-2.txt
    # $trainer configure -context_before -5 -context_after 5
    # apply_trainer $trainer ./data/crf/only-support-words-pos-lookup-w2v-context-5.txt

    $trainer destroy
}
# vim: syntax=tcl