-
Notifications
You must be signed in to change notification settings - Fork 6
/
corpus.py
158 lines (118 loc) · 4.1 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import sys
import gzip
import json
import operator
import numpy as np
from math import log
from mapping import Mapping
from utils import calc_LCS
from collections import defaultdict
from documents import QAdoc, ReviewDoc, Sentence
from multiprocessing import Pool
import multiprocessing
# Number of CPU cores reported by the OS; used as the worker count of the
# multiprocessing Pool that computes the pairwise features.
mp = multiprocessing.cpu_count()
# BM25 hyper-parameters used when scoring question/sentence pairs:
#   k1 appears in the numerator (k1 + 1) and in the denominator of each
#      per-term score,
#   b  weights the (sentence length / average sentence length) ratio.
k1 = 1.2
b = 0.75
def Multiprocess_compute(itemID):
    """Module-level adapter so that ``Pool.map`` can invoke a bound method.

    Despite its name, ``itemID`` is a two-element sequence: index 0 holds the
    item id and index 1 holds the object whose ``Multiprocess_PairWiseFeature``
    method is called with that id.  The method's result is returned unchanged.
    """
    corpus = itemID[1]
    item_index = itemID[0]
    return corpus.Multiprocess_PairWiseFeature(item_index)
class Corpus:
    """Questions, review sentences and their pairwise relevance features.

    Attributes:
        Map: ``Mapping`` built from the QA and review files; this class reads
            its ``ItemIDMap``, ``WordIDMap``, ``V``, ``QAfile`` and
            ``ReviewFile`` attributes.
        QAnswers: list of ``QAdoc`` objects, one per accepted question.
        Sentences: list handed to every ``ReviewDoc``; its elements are read
            through their ``Sent`` attribute (a sequence of word ids).
        QPerItem: ``QPerItem[item]`` is a list of indices into ``QAnswers``.
        SPerItem: ``SPerItem[item]`` is a list of indices into ``Sentences``.
        PairWiseFeature: maps ``(question index, sentence index)`` to a 1x3
            float64 array ``[[bm25, bm25_plus, LCS]]``.
        Avgdl: maps item id to the average sentence length of that item.
    """

    def __init__(self, QAFile, ReviewFile, minReview):
        """Create the id mappings and one empty question/sentence list per item.

        All three arguments are forwarded untouched to ``Mapping``.
        """
        self.Map = Mapping(QAFile, ReviewFile, minReview)
        self.QAnswers = []
        self.Sentences = []
        self.PairWiseFeature = {}
        self.Avgdl = defaultdict(float)

        self.Map.create_mappings()

        num_items = len(self.Map.ItemIDMap)
        self.QPerItem = [[] for _ in range(num_items)]
        self.SPerItem = [[] for _ in range(num_items)]

    def construct_QAnswersAndQPerItem(self):
        """Read the QA file and fill ``QAnswers`` and ``QPerItem``.

        Records whose ``asin`` is not in ``Map.ItemIDMap`` are skipped.
        Open-ended questions get the answer type ``'Not Applicable'``; every
        other question uses the record's ``answerType`` field.
        """
        print("Creating Question Answer objects\n")
        print("Reading QA Files")

        item_ids = self.Map.ItemIDMap
        # The context manager closes the gzip handle even if a record fails.
        with gzip.open(self.Map.QAfile, 'r') as qa:
            for qajson in qa:
                # NOTE(security): eval() executes whatever expression the line
                # contains, so only feed this trusted files.  If every record
                # is a plain literal, ast.literal_eval would be the safe
                # replacement -- confirm against the data before switching.
                l = eval(qajson)
                # Test membership on the mapping itself rather than .keys(),
                # which is a linear list scan on Python 2.
                if l['asin'] not in item_ids:
                    continue

                itemId = item_ids[l['asin']]
                qType = l['questionType']
                aType = 'Not Applicable' if qType == 'open-ended' else l['answerType']

                obj = QAdoc(itemId, qType, aType, l['question'], l['answer'],
                            self.Map.V, self.Map.WordIDMap)
                self.QAnswers.append(obj)
                self.QPerItem[itemId].append(len(self.QAnswers) - 1)

        print("Read QAfiles\n")

    def construct_SentencesAndSPerItem(self):
        """Read the review file and build a ``ReviewDoc`` per matching review.

        Records whose ``asin`` is not in ``Map.ItemIDMap`` are skipped.
        """
        print("Creating Sentences per Review\n")
        print("Reading Review Files")

        item_ids = self.Map.ItemIDMap
        with gzip.open(self.Map.ReviewFile, 'r') as review:
            for rjson in review:
                # NOTE(security): same eval() caveat as in the QA reader --
                # trusted input only.
                l = eval(rjson)
                if l['asin'] not in item_ids:
                    continue

                # The instance itself is not kept.  Nothing in this class
                # appends to Sentences/SPerItem, so ReviewDoc is presumably
                # what populates the two lists it is handed -- verify in
                # documents.py.
                ReviewDoc(item_ids[l['asin']], l['reviewText'], self.Sentences,
                          self.SPerItem, self.Map.V, self.Map.WordIDMap)

        print("Read Reviews\n")

    def Multiprocess_PairWiseFeature(self, itemID):
        """Compute BM25, BM25+ and LCS for every question/sentence pair of an item.

        Returns:
            A tuple ``(features, {itemID: avgdl})`` where ``features`` maps
            ``(question index, sentence index)`` to a 1x3 float64 array
            ``[[bm25, bm25_plus, LCS]]`` and ``avgdl`` is the item's average
            sentence length as returned by ``helper``.
        """
        IDF, TF, avgdl = self.helper(itemID)
        temp_pairwise = {}

        for question in self.QPerItem[itemID]:
            question_words = self.QAnswers[question].Question
            for sent in self.SPerItem[itemID]:
                sent_words = self.Sentences[sent].Sent

                # Length normalisation depends only on the sentence, so it is
                # computed once per sentence.  avgdl is 0 only when every
                # sentence of the item is empty; guard the division then.
                scaled_len = b * len(sent_words) / avgdl if avgdl else 0.0
                length_norm = k1 * (1 - b + scaled_len)

                bm25 = 0.0
                bm25_plus = 0.0
                for wordID in question_words:
                    idf = IDF[wordID]
                    # .get() avoids inserting zero entries into the
                    # defaultdict for words absent from the sentence.
                    tf = TF.get((sent, wordID), 0)
                    term_score = (idf * tf * (k1 + 1) * 1.0) / (tf + length_norm)
                    bm25 += term_score
                    # BM25+ (delta = 1) adds idf to each *term* score.  The
                    # previous code added the running bm25 total here, which
                    # over-counted every term after the first.
                    bm25_plus += term_score + idf

                LCS = calc_LCS(question_words, sent_words)
                temp_pairwise[(question, sent)] = np.array(
                    [[bm25, bm25_plus, LCS]], dtype=np.float64)

        return temp_pairwise, {itemID: avgdl}

    def helper(self, itemID):
        """Collect the term statistics of one item's sentences.

        Returns:
            ``(IDF, TF, avgdl)`` where

            * ``IDF`` is a length-``Map.V`` array with
              ``log(N + 1) - log(n_t)`` for every word id occurring in
              ``n_t`` of the item's ``N`` sentences, and 0 elsewhere;
            * ``TF`` is a ``defaultdict(int)`` keyed by
              ``(sentence index, word id)`` holding occurrence counts;
            * ``avgdl`` is the mean sentence length (0.0 when the item has no
              sentences, instead of raising ``ZeroDivisionError``).
        """
        sentence_ids = self.SPerItem[itemID]
        N = len(sentence_ids)
        vocab_size = self.Map.V

        TF = defaultdict(int)
        IDF = np.zeros((vocab_size))
        # word id -> set of sentence indices containing it.  This gives the
        # document frequency in one pass instead of rescanning every
        # (word, sentence) pair once per vocabulary entry.
        containing = defaultdict(set)

        total_len = 0.0
        for sent in sentence_ids:
            words = self.Sentences[sent].Sent
            total_len += len(words)
            for wordID in words:
                TF[sent, wordID] += 1
                containing[wordID].add(sent)

        for wordID, sentences in containing.items():
            # Only ids inside the vocabulary get an IDF, exactly as the
            # original range(V) scan did.
            if 0 <= wordID < vocab_size:
                IDF[wordID] = log(N + 1) - log(len(sentences))

        avgdl = (total_len * 1.0) / N if N else 0.0
        return IDF, TF, avgdl

    def Calculate_PairWiseFeature(self):
        """Compute pairwise features for all items in parallel and store them.

        Fills ``PairWiseFeature`` and ``Avgdl`` from the per-item worker
        results, then removes questions with an empty ``Question`` from
        ``QPerItem``.  Features already computed for those questions stay in
        ``PairWiseFeature``.
        """
        print("\n\nStarting pool...")
        # Same output as the Python 2 statement `print "...", mp`.
        print("%s %s" % ("Total number of cores found : ", mp))

        num_items = len(self.Map.ItemIDMap)
        pool = Pool(mp)
        try:
            # Each task is (item id, corpus); Multiprocess_compute unpacks it.
            dicts_ = pool.map(Multiprocess_compute,
                              zip(range(num_items), [self] * num_items))
        finally:
            # Shut the workers down even when a task raised.
            pool.close()
            pool.join()
        print("Stoping pool...")

        print("Assigning pairwise features...")
        for features, avgdl_by_item in dicts_:
            self.PairWiseFeature.update(features)
            self.Avgdl.update(avgdl_by_item)
        dicts_ = None  # release the per-item result dicts

        self._drop_empty_questions()
        print("Pairwise features created\n\n")

    def _drop_empty_questions(self):
        """Remove questions left with no words (after stem/vocab checks) from QPerItem."""
        empty_items = set(idx for idx, qa in enumerate(self.QAnswers)
                          if len(qa.Question) == 0)
        for idx, questions in enumerate(self.QPerItem):
            self.QPerItem[idx] = [q for q in questions if q not in empty_items]