-
Notifications
You must be signed in to change notification settings - Fork 1
/
simhash_hanming.py
129 lines (116 loc) · 3.68 KB
/
simhash_hanming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from hashlib import md5
import sys
class Token:
    """A document term's Simhash state: its sign-converted hash and weight.

    Attributes:
        hash_list: list of +1/-1 ints derived from the term's md5 bit string.
        weight: the term's weight (term frequency within the document).
    """

    def __init__(self, hash_list, weight):
        # Stored as-is; hash_threshold() combines weight * hash_list entries.
        self.hash_list = hash_list
        self.weight = weight
def tokenize(doc):
    """
    Lower-case *doc*, treat ',' and ';' as separators, and split on whitespace.

    :param doc: The document text to tokenize.
    :return: List of non-empty, lower-cased tokens.
    """
    doc = doc.lower()
    # Commas and semicolons act as token separators, like whitespace.
    doc = re.sub(r'[,;]', ' ', doc)
    # Raw string: '\s' in a plain string is an invalid escape and warns on
    # modern Python versions.
    tokens = re.split(r'\s+', doc)
    # Return a concrete list rather than a one-shot filter iterator, so the
    # result can be iterated more than once.
    return [t for t in tokens if t]
def md5Hash(token):
    """Return the md5 digest of *token* as a base-2 string (no '0b' prefix).

    Note: leading zero bits are not included; callers pad with zfill().
    """
    digest_value = int(md5(token.encode("utf-8")).hexdigest(), 16)
    return format(digest_value, "b")
def hash_threshold(token_dict, fp_len):
    """
    Sum the weighted +1/-1 hash vectors of every token, then binarize.

    :param token_dict: mapping term -> Token (with .hash_list and .weight)
    :param fp_len: number of bits in the fingerprint
    :return: list of fp_len ints, each 0 or 1
    """
    combined = [0] * fp_len
    for token in token_dict.values():
        w = token.weight
        combined = [acc + w * bit for acc, bit in zip(combined, token.hash_list)]
    # Strictly positive components become 1; zero or negative become 0.
    return [1 if component > 0 else 0 for component in combined]
def binconv(fp, fp_len):
    """
    Map a '0'/'1' bit string to a +1/-1 vector of length fp_len.

    Positions holding '0' become -1, anything else becomes +1; if fp is
    shorter than fp_len, the trailing entries stay +1 (as in the original).
    input : 1001...1
    output : [1,-1,-1, 1, ... , 1]
    """
    converted = [-1 if bit == '0' else 1 for bit in fp]
    return converted + [1] * (fp_len - len(converted))
def calc_weights(terms, fp_len):
    """
    Build a term -> Token mapping for the document's terms.

    Each Token holds the term's sign-converted md5 hash vector and a weight
    equal to the term's frequency within the document.

    :param terms: iterable of tokens (words) from the document
    :param fp_len: number of bits in the Simhash fingerprint
    :return: dict like {"my_term": Token([-1,1,-1,1,..,-1], 5)}
    """
    term_dict = {}
    for term in terms:
        entry = term_dict.get(term)
        if entry is None:
            # zfill pads the bit string because bin()/format() drop leading zeros.
            padded_bits = md5Hash(term).zfill(fp_len)
            entry = Token(binconv(padded_bits, fp_len), 0)
            term_dict[term] = entry
        # Weight = term frequency: one increment per occurrence.
        entry.weight += 1
    return term_dict
def simhash(doc, fp_len=128):
    """
    :param doc: The document we want to generate the Simhash value
    :fp_len: The number of bits we want our hash to be consisted of.
             Since each token is hashed with md5 (a 128-bit hash),
             fp_len should stay 128 unless a different token hash
             function is substituted.
    :return The Simhash value of a document ex. '0000100001110'
    """
    term_weights = calc_weights(tokenize(doc), fp_len)
    fingerprint_bits = hash_threshold(term_weights, fp_len)
    return ''.join(map(str, fingerprint_bits))
def hanming(hash1, hash2, HASH_LENGTH):
    """
    Return the Hamming distance between two integer hashes.

    Counts the bit positions, among the low HASH_LENGTH bits, where the two
    values differ.  Example: hanming(0b11010111, 0b10101100, 8) == 6.

    :param hash1: first hash value as an int
    :param hash2: second hash value as an int
    :param HASH_LENGTH: number of low-order bits to compare
    :return: the Hamming distance (int)
    """
    # XOR has a 1 exactly where the inputs differ; mask to HASH_LENGTH bits
    # and count the ones.  The per-bit loop and the debug print() calls that
    # polluted stdout are removed -- the result is identical.
    mask = (1 << HASH_LENGTH) - 1
    return bin((hash1 ^ hash2) & mask).count('1')
if __name__ == '__main__':
    # Demo: compute the Simhash fingerprint of each line of 'ceshi.txt' and
    # print the Hamming distance between the first two lines.
    filename = 'ceshi.txt'
    binary_hash = []
    with open(filename, 'r') as file_to_read:
        for line in file_to_read:
            # Parse the '0'/'1' fingerprint string as base 2.  The original
            # int(simhash(line)) read it as a DECIMAL number, producing a
            # wrong integer and therefore a wrong Hamming distance.
            binary_hash.append(int(simhash(line), 2))
    # Guard against files with fewer than two lines (was an IndexError).
    if len(binary_hash) < 2:
        sys.exit('need at least two lines in %s to compare' % filename)
    distance = hanming(binary_hash[0], binary_hash[1], 128)
    print(distance)