train_single_layer.py
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from xml.dom import minidom
import nltk
import pickle

# torch.manual_seed(1)  # uncomment for reproducible runs
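# NOTE: nltk.sent_tokenize and nltk.word_tokenize below require the NLTK
# 'punkt' tokenizer models; if they are missing, run once:
#   nltk.download('punkt')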
if __name__ == "__main__":
    START_TOKEN = 'SENT_START'
    END_TOKEN = 'SENT_END'

    # Load data
    print('Loading XML file...')
    xmldoc = minidom.parse('ted_en-20160408.xml')
    print('Getting English text...')
    textList = xmldoc.getElementsByTagName('content')
    print('Number of transcripts:', len(textList))

    sentenceList = []
    NUM_TRANSCRIPTS = 1000

    # Get tokenized sentences from the English transcripts
    for node in textList[:NUM_TRANSCRIPTS]:
        text = node.childNodes[0].nodeValue
        # Split the transcript into lowercased sentences
        sentences = nltk.sent_tokenize(text.lower())
        # Split sentences into words and add sentence-boundary tokens
        tokenized_sents = [nltk.word_tokenize(sent) for sent in sentences]
        tokenized_sents = [[START_TOKEN] + t + [END_TOKEN] for t in tokenized_sents]
        sentenceList += tokenized_sents
    # Use only some of the sentences for training and testing;
    # the vocabulary below is built from all tokenized sentences
    NUM_SENTENCES = 40000
    TESTING_SENTENCES = 10000
    print('Taking only', NUM_SENTENCES, 'sentences...')
    trainingList = sentenceList[:NUM_SENTENCES]
    testList = sentenceList[NUM_SENTENCES:NUM_SENTENCES + TESTING_SENTENCES]

    # Create word <-> index mappings from the transcripts
    word_to_index = {}
    index_to_word = []
    for sent in sentenceList:
        for word in sent:
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index)
                index_to_word.append(word)
    print('Size of vocabulary:', len(word_to_index))
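    # For example, since every sentence begins with SENT_START, it is the first
    # token ever seen, so word_to_index['SENT_START'] == 0 and
    # index_to_word[0] == 'SENT_START'.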
    def sent2id(sentence, word_to_index):
        """Convert a tokenized sentence to a LongTensor of vocabulary indices."""
        idxs = [word_to_index[word] for word in sentence]
        tensor = torch.LongTensor(idxs)
        # autograd.Variable is a legacy no-op wrapper in modern PyTorch
        return autograd.Variable(tensor)
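    # Example (hypothetical tokens; assumes each one is in word_to_index):
    #   sent2id(['SENT_START', 'hello', '.', 'SENT_END'], word_to_index)
    # returns a LongTensor of the four corresponding vocabulary indices,
    # ready to feed into the embedding layer.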
    class LSTM_Single_Layer(nn.Module):
        """Single-layer LSTM language model: embedding -> LSTM -> linear -> log-softmax."""

        def __init__(self, embedding_dim, hidden_dim, vocab_size):
            super(LSTM_Single_Layer, self).__init__()
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.hidden_dim = hidden_dim
            self.lstm = nn.LSTM(embedding_dim, hidden_dim)
            self.linear = nn.Linear(hidden_dim, vocab_size)
            self.hidden = self.init_hidden()

        def init_hidden(self):
            # (h_0, c_0), each of shape (num_layers, batch, hidden_dim)
            return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                    autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

        def forward(self, sentence):
            embeds = self.word_embeddings(sentence)
            # The LSTM expects input of shape (seq_len, batch, embedding_dim)
            lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
            # Log-probabilities over the vocabulary at every position
            log_probs = F.log_softmax(self.linear(lstm_out.view(len(sentence), -1)), dim=1)
            return log_probs
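    # Shape walk-through for a sentence of T tokens (batch size is fixed at 1):
    #   sentence              -> (T,)                 vocabulary indices
    #   word_embeddings       -> (T, EMBEDDING_DIM)
    #   view for the LSTM     -> (T, 1, EMBEDDING_DIM)
    #   lstm_out              -> (T, 1, HIDDEN_DIM)
    #   linear + log_softmax  -> (T, vocab_size)      log-probs per position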
    HIDDEN_DIM = 100
    EMBEDDING_DIM = 64
    model = LSTM_Single_Layer(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_index))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # Training loop (commented out; run once to train and save the parameters)
    '''
    i = 0
    for epoch in range(1):
        for sentence in trainingList:
            model.zero_grad()
            # Reset the hidden state so each sentence starts fresh
            model.hidden = model.init_hidden()
            sentence_in = sent2id(sentence, word_to_index)
            # Target is the input shifted left by one token, padded with SENT_END
            y_actual = torch.cat((sentence_in[1:],
                                  torch.LongTensor([word_to_index[END_TOKEN]])), 0)
            y_preds = model(sentence_in)
            loss = loss_function(y_preds, y_actual)
            loss.backward()
            optimizer.step()
            print(loss)
            print(i)
            i = i + 1
    torch.save(model.state_dict(), 'model_params.pkl')
    '''
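    # A minimal sketch of restoring the trained weights for evaluation,
    # assuming the training loop above has been run and saved model_params.pkl:
    #   model.load_state_dict(torch.load('model_params.pkl'))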
    # Persist the held-out sentences and vocabulary mappings for later use
    pickle.dump(testList, open("testList", "wb"))
    pickle.dump(word_to_index, open("word_to_index", "wb"))
    pickle.dump(index_to_word, open("index_to_word", "wb"))
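    # These pickled artifacts can be reloaded in a separate evaluation script
    # (a sketch, assuming the files sit in the current working directory):
    #   testList = pickle.load(open("testList", "rb"))
    #   word_to_index = pickle.load(open("word_to_index", "rb"))
    #   index_to_word = pickle.load(open("index_to_word", "rb"))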