Skip to content

Commit

Permalink
fix: change datatype of simhash to string, because pyarrow is incompatible with uint64 (#170)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhijianma authored Jan 4, 2024
1 parent afac978 commit 0431f25
Showing 1 changed file with 11 additions and 13 deletions.
24 changes: 11 additions & 13 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------

from collections import Counter, defaultdict, deque
from collections import defaultdict, deque
from typing import Dict, Set

import numpy as np
Expand Down Expand Up @@ -156,8 +156,8 @@ def compute_hash(self, sample):
f'Unimplemented tokenization method [{self.tokenization}]')

# compute simhash
sample[HashKeys.simhash] = np.uint64(
simhash.compute(map(simhash.unsigned_hash, tokens)))
sample[HashKeys.simhash] = str(
np.uint64(simhash.compute(map(simhash.unsigned_hash, tokens))))
return sample

def process(self, dataset, show_num=0):
Expand All @@ -176,25 +176,23 @@ def process(self, dataset, show_num=0):
# find matches
logger.info(f'Start querying {len(dataset)} samples.')
matches = simhash.find_all(
dataset[HashKeys.simhash],
np.uint64(dataset[HashKeys.simhash]),
self.num_blocks,
self.hamming_distance,
)
logger.info(f'Querying done, found {len(matches)} matches.')

# compute hash diff distribution
graph = defaultdict(dict)
dist = Counter()
for x, y in matches:
x = str(x)
y = str(y)
graph[x][y] = graph[y][x] = True
num_diff = num_differing_bits(x, y)
dist[num_diff] += 1
logger.info(f'Hash diff distribution: {dist}')

hash2ids: Dict[int, Set[str]] = defaultdict(set)
hashes: Set[int] = set(dataset[HashKeys.simhash])
hash2cluster: Dict[int, int] = {}
visited: Set[int] = set()

hash2ids: Dict[str, Set[str]] = defaultdict(set)
hashes: Set[str] = set(dataset[HashKeys.simhash])
hash2cluster: Dict[str, int] = {}
visited: Set[str] = set()
cluster_id: int = 0

for sid, hash_val in enumerate(dataset[HashKeys.simhash]):
Expand Down

0 comments on commit 0431f25

Please sign in to comment.