forked from Nanthini10/Similar-Duo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
minHash.py
199 lines (186 loc) · 6.59 KB
/
minHash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
'''
@author: nanthini, harshat
'''
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from scipy.sparse import find
import itertools
import numpy.matlib
# Our implementation to read in the csv file ,
# Due to computation time and memory constraints, we only consider the first 50,000 rows of the original csv file ,
# The remainder of the questions are solved with the matrix provided on piazza ,
'''
values = pd.read_csv('ml-20m/ratings.csv', usecols=['userId','movieId','rating'],nrows=50000) ,
values['rating'] = np.where(values['rating']>2.5,1,0) ,
new_values = values.pivot(index='movieId', columns = 'userId', values = 'rating') ,
new_values = new_values.fillna(0) ,
print new_values
'''
# Jaccard Similarity for users in form of set ,
def my_jaccard(a,b):
return float(len(a&b))/float(len(a|b))
# Jaccard Similarity for users in form of matrix ,
def jaccard_sim(a,b):
if float(np.logical_or(a,b).sum())==0:
return 0
return float(np.logical_and(a,b).sum())/float(np.logical_or(a,b).sum())
print "Loading the values from ratings_np.npy ..."
values = np.load('data/ratings_np.npy')
rows,cols = values.shape
# Number of Hash Functions ,
n = 1000
# Large Prime number greater than number of rows ,
R = 29063
# Generate a and b for hash functions between ,
# range of 1 and R so that there are no zeros ,
coeff=random.randint(1,R,size=[n,2])
srt = find(values)
# Sort the users and movies in order of the movies ,
arrSrt = srt[1].argsort()
movies = srt[0][arrSrt[::1]]
users = srt[1][arrSrt[::1]]
#Initialize list of sets for Question 1.3 ,
um = [set() for i in range(cols)]
''' ,
# Question 1.2 ,
(i)Computing for random 10000 pairs
(ii)max_10 has the maximum 10 of the random pairs
(iii)plotting the histogram
'''
print "Generating 10000 random pairs and calculating their similarities... "
# Randomly selecting 10,000 pairs ,
indices = random.randint(0,values.shape[1], size = 20000)
sim = list()
for i in range(0,20000,2):
# In the small chance that we select the same index, simply compare to next one ,
if(indices[i]==indices[i+1]):
indices[i+1]+=1
sim.append(jaccard_sim(values[:,indices[i]],values[:,indices[i+1]]))
# Method to find the largest 10 similarities explained in report
max_10 = [0 for i in range(10)]
min_10 = 0
for i in range(len(sim)):
if sim[i]>min_10:
max_10[max_10.index(min_10)] = sim[i]
min_10 = min(max_10)
print "The average Jaccard Similarity is: " ,np.asarray(sim).mean()
print "The max 10 Jaccard Similarity values are: "
print np.asarray(max_10)
print "Histogram for the data obtained..."
plt.hist(sim)
plt.title("Histogram")
plt.xlabel("Similarities")
plt.ylabel("Frequency")
plt.show()
'''
# Question 1.3 ,Efficient way to store the values
um: maps the users to a set of movies that they like
'''
u_indexj = 0
# Make a list of sets for the user ,
# That is, each user has a set of the movies which it liked ,
print "Storing the data efficiently in the form 'user:{movies}'"
for i in range(cols):
if(u_indexj==len(movies)):
break
while(users[u_indexj]==i + 1):
um[i].add(movies[u_indexj])
u_indexj +=1
if(u_indexj==len(users)):
break
# count the number os users that have actually liked movies ,
count1 = 0
for i in range (cols):
if (bool(um[i])):
count1 +=1
''' ,
# Question 1.4 , Partioning the signature matrix into bands,
then into buckets and finally finding candidate pairs
h1: signature matrix
hash_bands: buckets for each band
hash_bands[i]: buckets within a band
similar_pairs: candidate pairs that match the criteria of similarity>=0.65
'''
# Process to find the signature matrix is described in detail in report
print "Generating a signature matrix for the dataset..."
bMatrix = np.matlib.repmat(coeff[:,1],rows,1)
aMatrix = coeff[:,0]
mTemp = ([i+1 for i in range (rows)])
mNew = np.asarray(mTemp)
aMatrix.reshape(n,1)
myHash = np.outer(aMatrix,mTemp)
myHashFinal = np.mod(myHash+bMatrix.T,R)
h1 = np.zeros((n,cols))
for i in range(cols):
indices = um[i]
if (bool(um[i])):
h1[:,i] = myHashFinal[:,list(indices)].min(1)
print "Signature matrix generated with shape ",h1.shape
# We choose 100 bands of 10 rows each for 1000 hash functions ,
# Our decision to choose these values is explained in the report ,
band = 100
r = 10
# Choose a very large prime number
#RDash = 112519
RDash = 1299827
final = np.empty([band,cols])
a = random.randint(1,RDash, size = r)
a = a.reshape([r,1])
b = np.asarray(random.randint(1,RDash, size = r))
b = b.reshape([r,1])
b = np.matlib.repmat(b,1,cols)
hash_bands = {}
for i in range(band):
final[i] = np.sum(np.mod(np.multiply(a,h1[(i*r):(i*r)+r,:])+b,RDash),axis=0)
for j in range(final[i].shape[0]):
if i not in hash_bands:
hash_bands[i] = {}
if final[i][j] not in hash_bands[i]:
hash_bands[i][final[i][j]] = [j]
else:
hash_bands[i][final[i][j]].append(j)
similar_pairs = set()
for i in hash_bands:
for hash_num in hash_bands[i]:
if len(hash_bands[i][hash_num]) > 1:
for pair in itertools.combinations(hash_bands[i][hash_num], r=2):
u,v = pair
if float(len(um[v].union(um[u])))!=0 and (len(um[v].intersection(um[u])) / float(len(um[v].union(um[u])))>=0.65):
similar_pairs.add(pair)
print "The number of similar pairs is: ", len(similar_pairs)
'''
1.5: Given a random point, find the nearest neighbor
test = the random point
maxSim = determines the threshold that has to be satisfied
'''
#test = input("Enter a random query point(userID) whose nearest neighbor should be found: ")
test = random.randint(0,values.shape[0])
neighbors = set()
maxSim = 0.8
print "Checking if ",test," is present in similar pairs computed before"
flag = False
for pair in similar_pairs:
if test in pair:
comp = jaccard_sim(values[:,pair[0]],values[:,pair[1]])
if comp>maxSim:
maxSim = comp
maxInd = pair
flag = True
if flag:
print "Found the nearest neighbor as user ",maxInd," with similarity ",maxSim
if not flag:
print "Could not find a pair within similar_pairs, using brute force..."
flag = False
for j in range(values.shape[1]):
if j not in similar_pairs and j!=test:
comp = jaccard_sim(h1[:,test],h1[:,j])
if comp>maxSim:
maxSim = comp
maxInd = j
flag = True
if flag:
print "Found the nearest neighbor as user ",maxInd," with similarity ",maxSim
if not flag:
print "The user doesn't have any other user with similarity > ",maxSim