# compute_fsc.py
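# Computes form-semantics consistency (FSC) measures for a set of English target words and writes them,
# together with psycholinguistic covariates (AoA, frequency, concreteness, valence, OLD20, SND, and
# morphological complexity), to CSV files for subsequent statistical analysis.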
import os
import copy
import json
import random
import pickle
import itertools
import numpy as np
import pandas as pd
from datetime import datetime
from src.utils.io_ops import write_df
from semspaces.space import SemanticSpace
from src.utils.set_ops import shared_words
from src.utils.encode import get_celex_coverage
from src.measures.fsc_ld import levenshtein_fsc
from src.measures.fsc_te import target_embedded_fsc
from src.measures.semantic import neighborhood_density
from src.resources import aoa, concreteness, valence, subtlex, old20, morpholex
from src.utils.neighbors import get_target_embedded_neighbors, get_levenshtein_neighbours
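# Analysis toggles: 'random_baseline' additionally computes FSC over random permutations of the embedding
# matrix (a chance baseline); 'sample_vocab' additionally computes FSC over random subsamples of the
# reference vocabulary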
random_baseline = False
sample_vocab = True
data_dir = os.path.join(os.getcwd(), 'data/')
out_dir = os.path.join(os.getcwd(), 'output/')
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
# LOAD DATA
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started loading the data..."))
embedding_space = SemanticSpace.from_csv(
    os.path.join(data_dir, 'embedding_space.cbow.ukwac.subtlex.300dims.w5.w2v'),
    prenorm=True
)
w2v_words = embedding_space.included_words()
with open(os.path.join(data_dir, 'celex_dict.json')) as celex_file:
    celex = json.load(celex_file)
aoa_words, aoa_norms = aoa.read(os.path.join(data_dir, "AoA.xlsx"))
w2aoa = pd.Series(aoa_norms["Rating.Mean"].values, index=aoa_norms["Word"]).to_dict()
w2concr = concreteness.read(os.path.join(data_dir, "concreteness.txt"))
w2val = valence.read(os.path.join(data_dir, "valence.csv"))
w2freq = subtlex.read(os.path.join(data_dir, "subtlex.csv"))
w2old = old20.read(os.path.join(data_dir, "word2old.csv"))
mono = list(morpholex.read_mono(os.path.join(data_dir, "MorphoLEX_en.xlsx")))
poly = list(morpholex.read_poly(os.path.join(data_dir, "MorphoLEX_en.xlsx")))
mono_inflected = list(morpholex.read_mono_inflected(os.path.join(data_dir, "MorphoLEX_en.xlsx")))
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done"))
# CONCATENATE THE DIFFERENT SETS OF POTENTIAL TARGET WORDS FOR LATER FILTERING STEPS
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started finding the target words..."))
filter_words = list(
    itertools.chain(
        aoa_words, set(w2concr.keys()), set(w2val.keys()), mono, poly, mono_inflected,
    )
)
print('The reference vocabulary for measuring SND consists of {} words.'.format(len(filter_words)))
# FIND WORDS SHARED ACROSS RESOURCES, SUCH THAT WE CAN ESTIMATE ALL NECESSARY PREDICTORS: FREQUENCY, CONCRETENESS,
# VALENCE, SEMANTIC NEIGHBOURHOOD DENSITY, AND OLD20. SINCE ICONICITY NORMS ARE FAR SMALLER IN SIZE, A SEPARATE ANALYSIS
# IS RUN WITH THOSE, WITHOUT FILTERING WORDS FOR THE MAIN EXPERIMENT
shared = shared_words(
    aoa_words, set(w2concr.keys()), set(w2val.keys()), list(w2v_words), set(w2freq.keys()), set(w2old.keys())
)
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done"))
# ASSIGN BOOLEAN VALUE TO EACH WORD FOR WHICH ALL VARIABLES ARE AVAILABLE DEPENDING ON THEIR MORPHOLOGICAL COMPLEXITY
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Mapping targets to psycholinguistic properties..."))
w2morph = morpholex.compute_morph_complexity(shared, mono, mono_inflected, poly)
# RESTRICT TO WORDS FOR WHICH A UNIQUE, UNAMBIGUOUS PHONOLOGICAL TRANSCRIPTION IS AVAILABLE IN CELEX, THEN MAP
# (ORTHO, PHONO) TUPLES TO AOA VALUES
t2phon = {k: v for k, v in get_celex_coverage(w2morph.keys(), celex)[0]}
t2aoa = {k: v for k, v in w2aoa.items() if k in t2phon}
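# t2phon maps each target's orthographic form to its CELEX phonological transcription; t2aoa keeps only
# targets that also have an AoA rating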
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done."))
# COMPUTE SEMANTIC NEIGHBORHOOD DENSITY
snd_path = os.path.join(data_dir, "target2snd.json")
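# SND values are cached as JSON so that later runs reload them instead of recomputing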
try:
    with open(snd_path) as snd_file:
        t2snd = json.load(snd_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: File {} found and loaded.".format(snd_path)))
except FileNotFoundError:
    t2snd = neighborhood_density(embedding_space, t2phon.keys(), filter_words)
    with open(snd_path, 'w') as snd_file:
        json.dump(t2snd, snd_file)
# FIND NEIGHBORS (TARGET-EMBEDDING AND LEVENSHTEIN) FOR ORTHOGRAPHIC AND PHONOLOGICAL FORMS
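# drop entries that pandas presumably parsed as float NaN (note str(float('nan')) == 'nan')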
reference_vocab = [x for x in set(w2freq.keys()) if str(x) != 'nan']
print('The reference vocabulary for retrieving form-based neighbors consists of {} words.'.format(len(reference_vocab)))
ortho2neighbors_te = get_target_embedded_neighbors(set(t2phon.keys()), reference_vocab)
print(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done retrieving target-embedded neighbors for orthographic forms.")
)
ortho2neighbors_ld = get_levenshtein_neighbours(set(t2phon.keys()), reference_vocab)
print(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done retrieving Levenshtein distance neighbors for orthographic forms.")
)
phono2neighbors_te = get_target_embedded_neighbors(t2phon, reference_vocab, celex=celex)
print(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done retrieving target-embedded neighbors for phonological forms.")
)
phono2neighbors_ld = get_levenshtein_neighbours(t2phon, reference_vocab, celex=celex)
print(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done retrieving Levenshtein distance neighbors for phonological forms.")
)
# TRY TO FETCH FSC VALUES (ORTHO AND PHONO, COMPUTED USING TARGET-EMBEDDING NEIGHBORS OR LEVENSHTEIN DISTANCE
# NEIGHBORS) FROM FILE. IF THE FILE DOESN'T EXIST, COMPUTE VALUES
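# Naming: OSC/PSC = FSC computed over orthographic/phonological neighbors; the _te suffix marks
# target-embedded neighbors, _ld marks Levenshtein-distance neighbors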
fsc_dir = os.path.join(out_dir, 'FSCmeasures')
if not os.path.exists(fsc_dir):
    os.makedirs(fsc_dir)
osc_te_path = os.path.join(fsc_dir, "OSC_te.pkl")
try:
    with open(osc_te_path, "rb") as osc_te_file:
        t2OSC_te = pickle.load(osc_te_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: File {} found and loaded.".format(osc_te_path)))
except FileNotFoundError:
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started computing target-embedded OSC..."))
    t2OSC_te = target_embedded_fsc(ortho2neighbors_te, embedding_space, w2freq)
    with open(osc_te_path, "wb") as osc_te_file:
        pickle.dump(t2OSC_te, osc_te_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done."))
print()
osc_ld_path = os.path.join(fsc_dir, "OSC_ld.pkl")
try:
    with open(osc_ld_path, "rb") as osc_ld_file:
        t2OSC_ld = pickle.load(osc_ld_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: File {} found and loaded.".format(osc_ld_path)))
except FileNotFoundError:
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started computing Levenshtein OSC..."))
    t2OSC_ld = levenshtein_fsc(ortho2neighbors_ld, embedding_space)
    with open(osc_ld_path, "wb") as osc_ld_file:
        pickle.dump(t2OSC_ld, osc_ld_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done."))
print()
psc_te_path = os.path.join(fsc_dir, "PSC_te.pkl")
try:
    with open(psc_te_path, "rb") as psc_te_file:
        t2PSC_te = pickle.load(psc_te_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: File {} found and loaded.".format(psc_te_path)))
except FileNotFoundError:
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started computing target-embedded PSC..."))
    t2PSC_te = target_embedded_fsc(phono2neighbors_te, embedding_space, w2freq)
    with open(psc_te_path, "wb") as psc_te_file:
        pickle.dump(t2PSC_te, psc_te_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done."))
print()
psc_ld_path = os.path.join(fsc_dir, "PSC_ld.pkl")
try:
    with open(psc_ld_path, "rb") as psc_ld_file:
        t2PSC_ld = pickle.load(psc_ld_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: File {} found and loaded.".format(psc_ld_path)))
except FileNotFoundError:
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started computing Levenshtein PSC..."))
    t2PSC_ld = levenshtein_fsc(phono2neighbors_ld, embedding_space)
    with open(psc_ld_path, "wb") as psc_ld_file:
        pickle.dump(t2PSC_ld, psc_ld_file)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Done."))
print()
# WRITE MEASURES TO FILE FOR SUBSEQUENT STATISTICAL ANALYSIS
write_df(
    t2phon.keys(), os.path.join(fsc_dir, "fsc_measures.csv"), w2aoa, t2OSC_te, t2OSC_ld, t2PSC_te,
    t2PSC_ld, w2concr, w2val, w2freq, w2old, t2phon, w2morph, t2snd
)
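# If enabled, estimate a chance baseline: shuffling the row order of the embedding matrix pairs each word
# form with a random meaning, so any FSC that survives the permutation reflects chance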
if random_baseline:
    n_subsamples = 1000
    seeds = random.sample(range(0, 100000000), n_subsamples)
    print(datetime.now().strftime(
        "%d/%m/%Y %H:%M:%S: Started computing FSC from {} random permutations of the embeddings...".format(n_subsamples)
    ))
    # COMPUTE FSC MEASURES FROM RANDOM PERMUTATIONS OF THE WORD EMBEDDINGS, REPEAT 1000 TIMES AND SAVE MEASURES TO FILE
    random_embeddings = copy.deepcopy(embedding_space)
    fsc_dir_rnd = os.path.join(out_dir, 'FSCrandom')
    if not os.path.exists(fsc_dir_rnd):
        os.makedirs(fsc_dir_rnd)
    for i, seed in enumerate(seeds):
        np.random.seed(seed)
        print(datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started permutation {} of {}...".format(i + 1, n_subsamples)))
        random_embeddings.vectors = np.random.permutation(random_embeddings.vectors)
        t2OSC_te_rnd = target_embedded_fsc(ortho2neighbors_te, random_embeddings, w2freq)
        t2OSC_ld_rnd = levenshtein_fsc(ortho2neighbors_ld, random_embeddings)
        t2PSC_te_rnd = target_embedded_fsc(phono2neighbors_te, random_embeddings, w2freq)
        t2PSC_ld_rnd = levenshtein_fsc(phono2neighbors_ld, random_embeddings)
        write_df(
            t2phon.keys(), os.path.join(fsc_dir_rnd, "df{}.csv".format(i + 1)), w2aoa, t2OSC_te_rnd, t2OSC_ld_rnd,
            t2PSC_te_rnd, t2PSC_ld_rnd, w2concr, w2val, w2freq, w2old, t2phon, w2morph, t2snd
        )
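# If enabled, recompute the FSC measures from random subsamples (50% and 75%) of the reference vocabulary,
# writing one dataframe per sample (presumably a robustness check on the choice of reference vocabulary)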
if sample_vocab:
    n_subsamples = 500
    sampling_rates = [50, 75]
    seeds = random.sample(range(0, 100000000), n_subsamples)
    fsc_dir_subset = os.path.join(out_dir, 'FSCsubset')
    if not os.path.exists(fsc_dir_subset):
        os.makedirs(fsc_dir_subset)
    for rate in sampling_rates:
        print(datetime.now().strftime(
            "%d/%m/%Y %H:%M:%S: Started computing FSC from {} samples of {}% of the reference vocabulary...".format(
                n_subsamples, rate
            )
        ))
        fsc_subdir_subset = os.path.join(fsc_dir_subset, 'rate{}'.format(rate))
        if not os.path.exists(fsc_subdir_subset):
            os.makedirs(fsc_subdir_subset)
        # determine the size of each random sample of the reference vocabulary given the sampling rate
        target_vocab_size = round(len(reference_vocab) * rate / 100)
        for i, seed in enumerate(seeds):
            # seed Python's random module too: random.sample() below draws from it, and np.random.seed()
            # alone would leave the subsampling unseeded
            random.seed(seed)
            np.random.seed(seed)
            print(datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S: Sample {} of {}...".format(i + 1, n_subsamples)
            ))
            reference_vocab_subsample = random.sample(reference_vocab, target_vocab_size)
            ortho2neighbors_te_subsample = get_target_embedded_neighbors(set(t2phon.keys()), reference_vocab_subsample)
            print(datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S: Done retrieving target-embedded neighbors for orthographic forms."
            ))
            ortho2neighbors_ld_subsample = get_levenshtein_neighbours(set(t2phon.keys()), reference_vocab_subsample)
            print(datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S: Done retrieving Levenshtein distance neighbors for orthographic forms."
            ))
            phono2neighbors_te_subsample = get_target_embedded_neighbors(t2phon, reference_vocab_subsample, celex=celex)
            print(datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S: Done retrieving target-embedded neighbors for phonological forms."
            ))
            phono2neighbors_ld_subsample = get_levenshtein_neighbours(t2phon, reference_vocab_subsample, celex=celex)
            print(datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S: Done retrieving Levenshtein distance neighbors for phonological forms."
            ))
            t2OSC_te_subset = target_embedded_fsc(ortho2neighbors_te_subsample, embedding_space, w2freq)
            t2OSC_ld_subset = levenshtein_fsc(ortho2neighbors_ld_subsample, embedding_space)
            t2PSC_te_subset = target_embedded_fsc(phono2neighbors_te_subsample, embedding_space, w2freq)
            t2PSC_ld_subset = levenshtein_fsc(phono2neighbors_ld_subsample, embedding_space)
            write_df(
                t2phon.keys(), os.path.join(fsc_subdir_subset, "df{}.csv".format(i + 1)), w2aoa, t2OSC_te_subset,
                t2OSC_ld_subset, t2PSC_te_subset, t2PSC_ld_subset, w2concr, w2val, w2freq, w2old, t2phon,
                w2morph, t2snd
            )