---
title: "Vector spaces for authors"
output: html_document
---
# Load packages
```{r, echo = FALSE}
## Save package names as a vector of strings
## (dplyr, tidyr, stringr, and purrr are needed for the pipes and
## reshaping steps further below)
pkgs <- c(
  "quanteda",
  "textTinyR",
  "ClusterR",
  "text2vec",
  "dplyr",
  "tidyr",
  "stringr",
  "purrr"
)
## Install any packages that are not yet installed
lapply(pkgs[!(pkgs %in% installed.packages())], install.packages)
## Load all packages
lapply(pkgs, library, character.only = TRUE)
```
# Text Preprocessing
```{r}
# dat_agg2 (the aggregated author data) and stopword_list are assumed to be
# available from the preceding preprocessing scripts
author_corpus <- corpus(dat_agg2, text_field = "abstract")
author_tokens <- quanteda::tokens(
  author_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE,
  include_docvars = TRUE,
  verbose = TRUE
)
author_tokens <- quanteda::tokens_remove(author_tokens, stopword_list)
```
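As an optional sanity check (a sketch using quanteda's own helpers), the most frequent remaining tokens should look like substantive vocabulary rather than stopwords or punctuation:
```{r, eval = FALSE}
# optional sanity check: top features should be substantive words,
# not stopwords or punctuation
author_dfm <- dfm(author_tokens)
topfeatures(author_dfm, 20)
```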
# Extract word embeddings using word2vec
```{r keras_tf_first_time, message = FALSE}
# one-time setup: install the R interface to Keras and its Python backend
devtools::install_github("rstudio/keras")
library(keras)
install_keras()
```
```{r preprocess_tokens}
# collapse each author's tokens back into a single text string
tokens_author <- as.list(author_tokens)
tokens_author <- lapply(tokens_author, function(x) stringr::str_c(x, collapse = " "))
# reshape into a long data frame with one row per author document
tokens_author2 <- as.data.frame(tokens_author)
tokens_author3 <- tokens_author2 %>%
  tidyr::gather(identifier, text, text1:text22008)
# fit a Keras tokenizer on the 45,000 most frequent words
tokenizer <- text_tokenizer(num_words = 45000, lower = TRUE)
tokenizer %>% fit_text_tokenizer(tokens_author3$text)
```
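Optionally, a quick look at the fitted tokenizer (a sketch; in Keras, `word_index` is ordered by word frequency) shows the total vocabulary size and whether the most frequent words look plausible. Only the top `num_words` words enter the model.
```{r, eval = FALSE}
# total vocabulary seen by the tokenizer vs. the 45,000 words kept
length(tokenizer$word_index)
# the most frequent words (lowest ids)
head(names(tokenizer$word_index), 10)
```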
```{r skipgram_model}
# generator that yields (target, context) pairs plus 0/1 labels for
# skip-gram training with negative sampling
skipgrams_generator <- function(text, tokenizer, window_size, negative_samples) {
  gen <- texts_to_sequences_generator(tokenizer, sample(text))
  function() {
    skip <- generator_next(gen) %>%
      skipgrams(
        vocabulary_size = tokenizer$num_words,
        window_size = window_size,
        negative_samples = negative_samples
      )
    x <- transpose(skip$couples) %>% map(. %>% unlist() %>% as.matrix(ncol = 1))
    y <- skip$labels %>% as.matrix(ncol = 1)
    list(x, y)
  }
}
```
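Before training, it can be worth drawing a single batch from the generator to confirm its output structure (a sketch; the literal `10` and `1` stand in for the `skip_window` and `negative_samples` values set in the next chunk):
```{r, eval = FALSE}
# draw one batch: x is a list of (target, context) id matrices,
# y is a matrix of 0/1 labels
gen <- skipgrams_generator(tokens_author3$text, tokenizer, 10, 1)
batch <- gen()
str(batch)
```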
```{r hyperparameter_word2vec, message = FALSE}
embedding_size <- 100 # Dimension of the embedding vector.
skip_window <- 10 # How many words to consider left and right.
negative_samples <- 1 # Number of negative examples to sample for each word.
# define placeholders for the input
input_target <- layer_input(shape = 1)
input_context <- layer_input(shape = 1)
# the embedding acts as a lookup table for the word vectors,
# with dimensions (vocabulary size + 1) x embedding_size
embedding <- layer_embedding(
  input_dim = tokenizer$num_words + 1,
  output_dim = embedding_size,
  input_length = 1,
  name = "embedding"
)
target_vector <- input_target %>%
  embedding() %>%
  layer_flatten()
context_vector <- input_context %>%
  embedding() %>%
  layer_flatten()
# take the dot product of the target_vector and the context_vector as a
# similarity score
dot_product <- layer_dot(list(target_vector, context_vector), axes = 1)
# feed it into a dense layer with sigmoid activation to predict whether
# the pair is a true context pair or a negative sample
output <- layer_dense(dot_product, units = 1, activation = "sigmoid")
```
```{r keras_model}
model <- keras_model(list(input_target, input_context), output)
model %>% compile(loss = "binary_crossentropy", optimizer = "adam")
summary(model)
```
```{r keras_model_train}
# train the model on the data
model %>%
  fit_generator(
    skipgrams_generator(tokens_author3$text, tokenizer, skip_window, negative_samples),
    steps_per_epoch = 2000,
    epochs = 5
  )
```
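Optionally, the trained model can be persisted so that the weights can be reloaded later without retraining (a sketch; the file name is illustrative):
```{r, eval = FALSE}
# save the trained model (file name is illustrative)
save_model_hdf5(model, "w2v_author_model.h5")
```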
```{r w2v_embeddings}
# extract the weights from the model
embedding_matrix_author <- get_weights(model)[[1]]
# extract the words and their integer ids from the tokenizer
words <- tibble(
  word = names(tokenizer$word_index),
  id = as.integer(unlist(tokenizer$word_index))
)
words <- words %>%
  filter(id <= tokenizer$num_words) %>%
  arrange(id)
# row 1 holds the "UNK" vector, i.e., tokens we don't have an embedding for
row.names(embedding_matrix_author) <- c("UNK", words$word)
# save as a word2vec-style .txt file without the first row of unknowns
embedding_matrix2_author <- embedding_matrix_author[-1, ]
write.table(
  embedding_matrix2_author,
  file = "~/Downloads/w2v_embeddings_author_100.txt",
  row.names = TRUE, # keep the row names (the tokens)
  col.names = FALSE, # but remove the column names
  quote = FALSE # must be FALSE so that tokens (words) are not wrapped in
                # quotation marks -- Doc2Vec from textTinyR cannot handle this
)
```
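A common check on the trained embeddings is a nearest-neighbour query, sketched below with text2vec's cosine similarity (text2vec is already loaded above). `"network"` is only a placeholder query term; substitute any word that actually occurs in the fitted vocabulary.
```{r, eval = FALSE}
# placeholder query word -- replace with any word in the vocabulary
query <- embedding_matrix2_author["network", , drop = FALSE]
# cosine similarity of the query against all word vectors
sims <- text2vec::sim2(embedding_matrix2_author, query, method = "cosine", norm = "l2")
# the query word itself will rank first with similarity 1
head(sort(sims[, 1], decreasing = TRUE), 10)
```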
```{r}
# ensure that the tokens are lower-case and stored in a list
author_tokens_lower <- tokens_tolower(author_tokens)
author_tokens_d2v <- as.list(author_tokens_lower)
# initialize a new Doc2Vec object using the embedding matrix from word2vec
init <- textTinyR::Doc2Vec$new(
  token_list = author_tokens_d2v,
  word_vector_FILE = "~/Downloads/w2v_embeddings_author_100.txt",
  print_every_rows = 5000,
  verbose = TRUE,
  copy_data = FALSE # if the embedding matrix is large, FALSE is more memory-efficient
)
# inspect the stored embedding matrix (to ensure it returns non-zero entries);
# this only works if copy_data = TRUE above
# res_wv <- init$pre_processed_wv()
# str(res_wv)
# build the document vectors using min_max_norm (sum_sqrt is an alternative)
author2_minmax <- init$doc2vec_methods(method = "min_max_norm", threads = 6)
# check the dimensions - should be NUMBER-OF-DOCUMENTS x embedding_size
dim(author2_minmax)
# replace the rownames with the author identifiers
rownames(author2_minmax) <- tokens_author3$identifier
```
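One possible downstream use of the author vectors, sketched here with ClusterR (loaded at the top), is k-means clustering of the documents; the number of clusters below is an arbitrary illustration, not a recommendation.
```{r, eval = FALSE}
set.seed(42)
# cluster the author vectors; clusters = 10 is an arbitrary illustration
km <- ClusterR::KMeans_rcpp(author2_minmax, clusters = 10, num_init = 3)
# how many authors fall into each cluster
table(km$clusters)
```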