-
Notifications
You must be signed in to change notification settings - Fork 1
/
simhash_hanming.py
129 lines (116 loc) · 3.68 KB
/
simhash_hanming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from hashlib import md5
import sys
class Token:
    """A document term's Simhash state: its sign-converted hash and weight.

    Attributes:
        hash_list: list of +1/-1 ints derived from the term's md5 bit string.
        weight: the term's weight (term frequency within the document).
    """

    def __init__(self, hash_list, weight):
        # Stored as-is; hash_threshold() combines weight * hash_list entries.
        self.hash_list = hash_list
        self.weight = weight
def tokenize(doc):
    """
    Lower-case *doc*, treat ',' and ';' as separators, and split on whitespace.

    :param doc: The document text to tokenize.
    :return: List of non-empty, lower-cased tokens.
    """
    doc = doc.lower()
    # Commas and semicolons act as token separators, like whitespace.
    doc = re.sub(r'[,;]', ' ', doc)
    # Raw string: '\s' in a plain string is an invalid escape and warns on
    # modern Python versions.
    tokens = re.split(r'\s+', doc)
    # Return a concrete list rather than a one-shot filter iterator, so the
    # result can be iterated more than once.
    return [t for t in tokens if t]
def md5Hash(token):
    """Return the md5 digest of *token* as a base-2 string (no '0b' prefix).

    Note: leading zero bits are not included; callers pad with zfill().
    """
    digest_value = int(md5(token.encode("utf-8")).hexdigest(), 16)
    return format(digest_value, "b")
def hash_threshold(token_dict, fp_len):
    """
    Sum the weighted +1/-1 hash vectors of every token, then binarize.

    :param token_dict: mapping term -> Token (with .hash_list and .weight)
    :param fp_len: number of bits in the fingerprint
    :return: list of fp_len ints, each 0 or 1
    """
    combined = [0] * fp_len
    for token in token_dict.values():
        w = token.weight
        combined = [acc + w * bit for acc, bit in zip(combined, token.hash_list)]
    # Strictly positive components become 1; zero or negative become 0.
    return [1 if component > 0 else 0 for component in combined]
def binconv(fp, fp_len):
    """
    Map a '0'/'1' bit string to a +1/-1 vector of length fp_len.

    Positions holding '0' become -1, anything else becomes +1; if fp is
    shorter than fp_len, the trailing entries stay +1 (as in the original).
    input : 1001...1
    output : [1,-1,-1, 1, ... , 1]
    """
    converted = [-1 if bit == '0' else 1 for bit in fp]
    return converted + [1] * (fp_len - len(converted))
def calc_weights(terms, fp_len):
    """
    Build a term -> Token mapping for the document's terms.

    Each Token holds the term's sign-converted md5 hash vector and a weight
    equal to the term's frequency within the document.

    :param terms: iterable of tokens (words) from the document
    :param fp_len: number of bits in the Simhash fingerprint
    :return: dict like {"my_term": Token([-1,1,-1,1,..,-1], 5)}
    """
    term_dict = {}
    for term in terms:
        entry = term_dict.get(term)
        if entry is None:
            # zfill pads the bit string because bin()/format() drop leading zeros.
            padded_bits = md5Hash(term).zfill(fp_len)
            entry = Token(binconv(padded_bits, fp_len), 0)
            term_dict[term] = entry
        # Weight = term frequency: one increment per occurrence.
        entry.weight += 1
    return term_dict
def simhash(doc, fp_len=128):
    """
    :param doc: The document we want to generate the Simhash value
    :fp_len: The number of bits we want our hash to be consisted of.
             Since each token is hashed with md5 (a 128-bit hash),
             fp_len should stay 128 unless a different token hash
             function is substituted.
    :return The Simhash value of a document ex. '0000100001110'
    """
    term_weights = calc_weights(tokenize(doc), fp_len)
    fingerprint_bits = hash_threshold(term_weights, fp_len)
    return ''.join(map(str, fingerprint_bits))
def hanming(hash1, hash2, HASH_LENGTH):
    """
    Return the Hamming distance between two integer hashes.

    Counts the bit positions, among the low HASH_LENGTH bits, where the two
    values differ.  Example: hanming(0b11010111, 0b10101100, 8) == 6.

    :param hash1: first hash value as an int
    :param hash2: second hash value as an int
    :param HASH_LENGTH: number of low-order bits to compare
    :return: the Hamming distance (int)
    """
    # XOR has a 1 exactly where the inputs differ; mask to HASH_LENGTH bits
    # and count the ones.  The per-bit loop and the debug print() calls that
    # polluted stdout are removed -- the result is identical.
    mask = (1 << HASH_LENGTH) - 1
    return bin((hash1 ^ hash2) & mask).count('1')
if __name__ == '__main__':
    # Demo: compute the Simhash fingerprint of each line of 'ceshi.txt' and
    # print the Hamming distance between the first two lines.
    filename = 'ceshi.txt'
    binary_hash = []
    with open(filename, 'r') as file_to_read:
        for line in file_to_read:
            # Parse the '0'/'1' fingerprint string as base 2.  The original
            # int(simhash(line)) read it as a DECIMAL number, producing a
            # wrong integer and therefore a wrong Hamming distance.
            binary_hash.append(int(simhash(line), 2))
    # Guard against files with fewer than two lines (was an IndexError).
    if len(binary_hash) < 2:
        sys.exit('need at least two lines in %s to compare' % filename)
    distance = hanming(binary_hash[0], binary_hash[1], 128)
    print(distance)