-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClusterSImilarity.py
138 lines (109 loc) · 3.66 KB
/
ClusterSImilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
from pprint import pprint
from fuzzywuzzy import fuzz
from datasketch import MinHash, MinHashLSH
class ClusterSimilarity:
    """Word-overlap similarity between two underscore-separated actor names."""

    def matches(self, stra, strb):
        """Return True when ``stra`` occurs as a substring of ``strb``."""
        return stra in strb

    def measure(self, currentActor, newActor):
        """Score how well the shorter name's words appear, in order, in the longer.

        Both names are split on ``"_"``. The words of the longer name are
        scanned left to right; each time the next unmatched word of the
        shorter name is a substring of the scanned word, it counts as matched.
        When both names have the same number of words, ``currentActor`` is
        treated as the longer one.

        Args:
            currentActor: existing actor name, words separated by ``"_"``.
            newActor: suggested actor name, words separated by ``"_"``.

        Returns:
            A float: the number of matched words divided by the number of
            words in ``newActor``.
        """
        wordsCurrentActor = currentActor.split("_")
        wordsNewActor = newActor.split("_")

        # The current actor name may be a superset of the suggested name, or
        # the other way around: always scan the longer one.
        if len(wordsCurrentActor) >= len(wordsNewActor):
            longer, shorter = wordsCurrentActor, wordsNewActor
        else:
            longer, shorter = wordsNewActor, wordsCurrentActor

        matched = 0
        for word in longer:
            # Stop once every word of the shorter name has been matched;
            # indexing shorter[matched] past this point raised IndexError.
            if matched == len(shorter):
                break
            if self.matches(shorter[matched], word):
                matched += 1

        # NOTE: the denominator is the new actor's word count in both cases,
        # as in the original implementation. str.split never returns an empty
        # list, so this cannot divide by zero.
        return float(matched) / len(wordsNewActor)
#==================================================================================================
class FuzzyClusterSimilarity(ClusterSimilarity):
    """Similarity based on fuzzywuzzy's partial-ratio string matching."""

    def measure(self, currentActor, newActor):
        """Return ``fuzz.partial_ratio`` of the two names.

        Underscores in both names are replaced with spaces before comparing.

        Args:
            currentActor: existing actor name, words separated by ``"_"``.
            newActor: suggested actor name, words separated by ``"_"``.

        Returns:
            The score produced by ``fuzz.partial_ratio`` (documented by
            fuzzywuzzy as an integer between 0 and 100).
        """
        current_text = currentActor.replace("_", " ")
        new_text = newActor.replace("_", " ")
        return fuzz.partial_ratio(current_text, new_text)
#==================================================================================================
class MinhashClusterSimilarity(ClusterSimilarity):
    """Similarity based on MinHash signatures of the words in each name."""

    # Number of permutations used for every MinHash signature built here.
    NUM_PERM = 128

    # Class-level cache mapping a normalised name to its MinHash; it is
    # shared by all instances. Signatures are keyed by name only, so NUM_PERM
    # should not be changed once the cache has been populated.
    cache = {}

    def getHashed(self, name=""):
        """Return the MinHash signature for ``name``, building it on first use.

        Args:
            name: space-separated name to hash. The default used to be a
                list literal, which is unhashable and made a no-argument
                call fail on the cache lookup; it is now an empty string.

        Returns:
            The cached ``MinHash`` for ``name``; one is created and stored
            in ``cache`` when the name has not been seen before.
        """
        signature = self.cache.get(name)
        if signature is not None:
            return signature

        # Pass NUM_PERM explicitly so the declared constant is actually used.
        signature = MinHash(num_perm=self.NUM_PERM)
        for word in name.split(' '):
            signature.update(word.encode('utf8'))
        self.cache[name] = signature
        return signature

    def measure(self, currentActor, newActor):
        """Return the MinHash Jaccard score of the two names as a percentage.

        Underscores are replaced with spaces and surrounding whitespace is
        stripped before each name is hashed.

        Args:
            currentActor: existing actor name, words separated by ``"_"``.
            newActor: suggested actor name, words separated by ``"_"``.

        Returns:
            ``int(100 * jaccard)`` where ``jaccard`` is the value returned
            by ``MinHash.jaccard`` for the two signatures.
        """
        ca = currentActor.replace("_", " ").strip()
        na = newActor.replace("_", " ").strip()
        m1 = self.getHashed(ca)
        m2 = self.getHashed(na)
        return int(100 * m1.jaccard(m2))
#===================================================================================================
# clusterSimilarity = MinhashClusterSimilarity()
#
# actorFile = open("petrarch2/data/dictionaries/Phoenix.International.actors.txt")
#
# currentActor = None
# actorNamesDict = {}
#
# for line in actorFile:
# line = line.strip()
# if line.startswith('#') or len(line) == 0: # if it is a comment
# continue
# line = line.split('#')[0]
#
# line = re.sub(r'\[[^\]]*\]', '', line).strip()
#
# if len(line) != 0:
# if line.startswith("+"):
# if currentActor not in actorNamesDict:
# actorNamesDict[currentActor] = []
# actorNamesDict[currentActor].append(line.replace("+",""))
# else:
# currentActor = line
#
#
# #pprint(actorNamesDict)
#
# actorSynsetRatio = {}
#
# for key in actorNamesDict:
# actorSynonyms = actorNamesDict[key]
# actorSynsetRatio[key] = {}
# for other in actorSynonyms:
# res = clusterSimilarity.measure(key, other)
# actorSynsetRatio[key][other] = res
#
# pprint(actorSynsetRatio)
#
#
# diffActorRatio = {}
#
# actorNames = []
#
# for key in actorNamesDict:
# actorNames.append(key)
#
# i = 0
# for i in range(0, len(actorNames)):
# diffActorRatio[actorNames[i]] = {}
# for j in range(i+1, len(actorNames)):
# res = clusterSimilarity.measure(actorNames[i], actorNames[j])
# diffActorRatio[actorNames[i]][actorNames[j]] = res
#
# print "\n"
# pprint(diffActorRatio)
#
# print clusterSimilarity.measure("Donald Trump", "Melanila Trump")