Skip to content

Commit

Permalink
Do not tokenize content twice
Browse files Browse the repository at this point in the history
    * Update formatting

Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Oct 22, 2024
1 parent 99d2b0c commit cf0107b
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 182 deletions.
49 changes: 25 additions & 24 deletions src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,23 @@

from matchcode_toolkit.halohash import BitAverageHaloHash


# A collection of directory fingerprints that we want to avoid
IGNORED_DIRECTORY_FINGERPRINTS = [
# This is both the directory content and directory structure fingerprint for
# an empty directory.
'0000000000000000000000000000000000000000',
"0000000000000000000000000000000000000000",
]


def _create_directory_fingerprint(inputs):
"""
Return a directory fingerprint string computed from the iterable of strings
`inputs`.

Falsy entries of `inputs` (such as empty strings or None) are skipped; the
remaining entries are encoded as UTF-8 and hashed together with a 128-bit
BitAverageHaloHash.

The returned string is the concatenation of:
- the number of hashed inputs, formatted as 8 zero-padded lowercase
  hexadecimal digits, and
- the hexadecimal digest of the 128-bit BitAverageHaloHash.
"""
inputs = [i.encode('utf-8') for i in inputs if i]
inputs = [i.encode("utf-8") for i in inputs if i]
bah128 = BitAverageHaloHash(inputs, size_in_bits=128).hexdigest()
inputs_count = len(inputs)
inputs_count_hex_str = '%08x' % inputs_count
bah128 = bah128.decode('utf-8')
inputs_count_hex_str = "%08x" % inputs_count
bah128 = bah128.decode("utf-8")
directory_fingerprint = inputs_count_hex_str + bah128
return directory_fingerprint

Expand All @@ -55,7 +54,7 @@ def _get_resource_subpath(resource, top):
The subpath returned would be 'baz.c'
"""
_, _, subpath = resource.path.partition(top.path)
subpath = subpath.lstrip('/')
subpath = subpath.lstrip("/")
return subpath


Expand Down Expand Up @@ -88,16 +87,16 @@ def _compute_directory_fingerprints(directory, codebase):
return

directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(directory, 'directory_content_fingerprint'):
if hasattr(directory, "directory_content_fingerprint"):
directory.directory_content_fingerprint = directory_content_fingerprint
else:
directory.extra_data['directory_content'] = directory_content_fingerprint
directory.extra_data["directory_content"] = directory_content_fingerprint

directory_structure_fingerprint = create_structure_fingerprint(directory, children)
if hasattr(directory, 'directory_structure_fingerprint'):
if hasattr(directory, "directory_structure_fingerprint"):
directory.directory_structure_fingerprint = directory_structure_fingerprint
else:
directory.extra_data['directory_structure'] = directory_structure_fingerprint
directory.extra_data["directory_structure"] = directory_structure_fingerprint

directory.save(codebase)
return directory
Expand Down Expand Up @@ -162,7 +161,7 @@ def create_halohash_chunks(bah128):


# Split on whitespace, punctuation and underscores: keep only runs of letters and digits
query_pattern = '[^_\\W]+'
query_pattern = "[^_\\W]+"
word_splitter = re.compile(query_pattern, re.UNICODE).findall


Expand Down Expand Up @@ -237,28 +236,30 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
"hailstorm": [],
}

# Create fingerprint
words = tokenizer(content)
# tokenize content into words
words = list(tokenizer(content))

# Create a file fingerprint from the number of elements in the content hash
# and the content hash digest itself.
ngs = ngrams(words, ngram_length)
ngs_bytes = [[g.encode('utf-8') for g in ng] for ng in ngs]
ngs_bytes = [b''.join(ng) for ng in ngs_bytes]
ngs_bytes = [[g.encode("utf-8") for g in ng] for ng in ngs]
ngs_bytes = [b"".join(ng) for ng in ngs_bytes]
content_hash, ngs_count = BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
if content_hash:
content_fingerprint = content_hash.hexdigest().decode('utf-8')
ngs_count_hex_str = '%08x' % ngs_count
content_fingerprint = content_hash.hexdigest().decode("utf-8")
ngs_count_hex_str = "%08x" % ngs_count
file_fingerprint = ngs_count_hex_str + content_fingerprint
fingerprints['halo1'] = file_fingerprint
fingerprints["halo1"] = file_fingerprint

words = tokenizer(content)
# Select windows from the content to find snippet similarities
windows = ngrams(words, window_length)
selected_windows = select_ngrams(windows)
selected_windows_bytes = [[g.encode('utf-8') for g in window] for window in selected_windows]
selected_windows_bytes = [b''.join(window) for window in selected_windows_bytes]
selected_windows_bytes = [[g.encode("utf-8") for g in window] for window in selected_windows]
selected_windows_bytes = [b"".join(window) for window in selected_windows_bytes]
hailstorm_hashes = [
BitAverageHaloHash(window).hexdigest().decode('utf-8')
for window in selected_windows_bytes
BitAverageHaloHash(window).hexdigest().decode("utf-8") for window in selected_windows_bytes
]
if hailstorm_hashes:
fingerprints['hailstorm'] = hailstorm_hashes
fingerprints["hailstorm"] = hailstorm_hashes

return fingerprints
22 changes: 17 additions & 5 deletions src/matchcode_toolkit/halohash.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,10 @@ def __init__(self, msg=None, size_in_bits=128):
# TODO: pick one hash algorithm
self.hashmodule = commoncode_hash.get_hasher(size_in_bits)
except:
raise Exception('No available hash module for the requested '
'hash size in bits: %(size_in_bits)d' % locals())
raise Exception(
"No available hash module for the requested "
"hash size in bits: %(size_in_bits)d" % locals()
)
self.update(msg)

@property
Expand All @@ -190,7 +192,13 @@ def update(self, msg):
"""
if not msg:
return
if isinstance(msg, (list, tuple,)):
if isinstance(
msg,
(
list,
tuple,
),
):
for m in msg:
self.__hashup(m)
else:
Expand Down Expand Up @@ -242,7 +250,9 @@ def combine(cls, hashes):
"""
size_in_bits = hashes[0].size_in_bits
for h in hashes:
assert isinstance(hash, cls), 'all hashes should be a BitAverageHaloHash, not {}'.format(type(h))
assert isinstance(
hash, cls
), "all hashes should be a BitAverageHaloHash, not {}".format(type(h))
assert h.size_in_bits == size_in_bits

all_columns = [h.columns for h in hashes]
Expand Down Expand Up @@ -313,7 +323,9 @@ def slices(s, size):
... pass
"""
length = len(s)
assert length % size == 0, 'Invalid slice size: len(%(s)r) is not a multiple of %(size)r' % locals()
assert length % size == 0, (
"Invalid slice size: len(%(s)r) is not a multiple of %(size)r" % locals()
)
# TODO: time alternative
# return [s[index:index + size] for index in range(0, length, size)]
chunks = [iter(s)] * size
Expand Down
13 changes: 6 additions & 7 deletions src/matchcode_toolkit/plugin_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@

import attr

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import SCAN_GROUP
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl

from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes


@scan_impl
class FingerprintScanner(ScanPlugin):
Expand All @@ -28,12 +29,10 @@ class FingerprintScanner(ScanPlugin):
sort_order = 6
options = [
PluggableCommandLineOption(
(
'--fingerprint',
),
("--fingerprint",),
is_flag=True,
default=False,
help='Compute directory and resource fingerprints that are used for matching',
help="Compute directory and resource fingerprints that are used for matching",
help_group=SCAN_GROUP,
sort_order=20,
)
Expand Down
Loading

0 comments on commit cf0107b

Please sign in to comment.