
Merge pull request #12 from aboutcode-org/hailstorm
Add snippet hash
JonoYang authored Oct 29, 2024
2 parents 5ccab02 + 8001b85 commit 7b96032
Showing 10 changed files with 1,973 additions and 221 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.rst
@@ -1,12 +1,16 @@
Changelog
=========

vNext
-----

*2024-10-29* -- Collect snippet hashes in fingerprint plugin.

v5.1.1
------

*2024-08-07* -- Update link references of ownership from nexB to aboutcode-org.


v5.1.0
------

92 changes: 45 additions & 47 deletions src/matchcode_toolkit/fingerprinting.py
@@ -12,24 +12,23 @@

from matchcode_toolkit.halohash import BitAverageHaloHash


# A collection of directory fingerprints that we want to avoid
IGNORED_DIRECTORY_FINGERPRINTS = [
# This is both the directory content and directory structure fingerprint for
# an empty directory.
'0000000000000000000000000000000000000000',
"0000000000000000000000000000000000000000",
]


def _create_directory_fingerprint(inputs):
"""
Return a 128-bit BitAverageHaloHash fingerprint in hex from `inputs`
"""
inputs = [i.encode('utf-8') for i in inputs if i]
inputs = [i.encode("utf-8") for i in inputs if i]
bah128 = BitAverageHaloHash(inputs, size_in_bits=128).hexdigest()
inputs_count = len(inputs)
inputs_count_hex_str = '%08x' % inputs_count
bah128 = bah128.decode('utf-8')
inputs_count_hex_str = "%08x" % inputs_count
bah128 = bah128.decode("utf-8")
directory_fingerprint = inputs_count_hex_str + bah128
return directory_fingerprint
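
The resulting fingerprint is a fixed-width hex string: an 8-hex-digit count of hashed inputs followed by the 32-hex-digit 128-bit BitAverageHaloHash digest. A minimal sketch of that layout (the input names below are invented):

from matchcode_toolkit.fingerprinting import _create_directory_fingerprint

# Hypothetical child names of a directory; empty entries are dropped by the function.
fp = _create_directory_fingerprint(["setup.py", "src/foo.py", "src/bar.py"])
assert len(fp) == 8 + 32
print(fp[:8])  # "00000003": the number of hashed inputs as 8 hex digits
print(fp[8:])  # the 32-hex-digit BitAverageHaloHash digest of the inputs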

@@ -55,7 +54,7 @@ def _get_resource_subpath(resource, top):
The subpath returned would be 'baz.c'
"""
_, _, subpath = resource.path.partition(top.path)
subpath = subpath.lstrip('/')
subpath = subpath.lstrip("/")
return subpath


@@ -88,16 +87,16 @@ def _compute_directory_fingerprints(directory, codebase):
return

directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(directory, 'directory_content_fingerprint'):
if hasattr(directory, "directory_content_fingerprint"):
directory.directory_content_fingerprint = directory_content_fingerprint
else:
directory.extra_data['directory_content'] = directory_content_fingerprint
directory.extra_data["directory_content"] = directory_content_fingerprint

directory_structure_fingerprint = create_structure_fingerprint(directory, children)
if hasattr(directory, 'directory_structure_fingerprint'):
if hasattr(directory, "directory_structure_fingerprint"):
directory.directory_structure_fingerprint = directory_structure_fingerprint
else:
directory.extra_data['directory_structure'] = directory_structure_fingerprint
directory.extra_data["directory_structure"] = directory_structure_fingerprint

directory.save(codebase)
return directory
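
A rough usage sketch for the codebase-level helper, assuming the usual commoncode VirtualCodebase API and a hypothetical scan file; as shown above, directory fingerprints land either on dedicated resource attributes or in extra_data:

from commoncode.resource import VirtualCodebase
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints

codebase = VirtualCodebase(location="scan.json")  # hypothetical earlier scan output
compute_codebase_directory_fingerprints(codebase)
for resource in codebase.walk():
    if resource.is_dir:
        print(resource.path, resource.extra_data.get("directory_content"))
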
@@ -162,7 +161,7 @@ def create_halohash_chunks(bah128):


# Split on whitespace and punctuation: keep only letters and numbers
query_pattern = '[^_\\W]+'
query_pattern = "[^_\\W]+"
word_splitter = re.compile(query_pattern, re.UNICODE).findall
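
For reference, a tiny self-contained illustration of what this pattern keeps and drops (the sample string is invented):

import re

word_splitter = re.compile("[^_\\W]+", re.UNICODE).findall
print(word_splitter("Hello_world, foo-bar 42".lower()))
# ['hello', 'world', 'foo', 'bar', '42']: underscores and punctuation split words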


@@ -195,7 +194,7 @@ def tokenizer(text):
return _tokenizer(text.lower())


def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
def get_file_fingerprint_hashes(location, ngram_length=8, window_length=64, **kwargs):
"""
Return a mapping of fingerprint hashes for the file at `location`
@@ -218,50 +217,49 @@ def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
with open(location) as f:
content = f.read()

file_fingerprint = create_file_fingerprint(
return create_file_fingerprints(
content,
ngram_length=ngram_length
)
return dict(
halo1=file_fingerprint
ngram_length=ngram_length,
window_length=window_length,
)
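
A hedged usage sketch of the updated entry point (the path below is hypothetical): it now returns both the whole-file halo1 fingerprint and the per-window snippet hashes.

from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes

fingerprints = get_file_fingerprint_hashes("src/example.c", ngram_length=8, window_length=64)
print(fingerprints["halo1"])          # 8 hex digits of ngram count + 32 hex digits of halohash
print(len(fingerprints["snippets"]))  # how many selected 64-word windows were hashed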


def create_content_hash(content, ngram_length=8):
def create_file_fingerprints(content, ngram_length=8, window_length=64):
"""
Return a 128-bit BitAverageHaloHash from file `content` and the number of
ngrams inserted into the hash
Return a mapping of halo1 and snippet hashes from content
"""
from licensedcode.tokenize import ngrams
from licensedcode.tokenize import select_ngrams

# Break content into words, then create ngrams from words
words = tokenizer(content)
ngs = ngrams(words, ngram_length)

# Convert each list of ngrams to a sequence of bytes
ngs_bytes = [[g.encode('utf-8') for g in ng] for ng in ngs]

# Join all ngrams into a single bytearray
ngs_bytes = [b''.join(ng) for ng in ngs_bytes]

# Create fingerprints and return fingerprint hashes
if ngs_bytes:
return BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
else:
return None, 0
fingerprints = {
"halo1": "",
"snippets": [],
}

# Tokenize content into words
words = list(tokenizer(content))

def create_file_fingerprint(content, ngram_length=8):
"""
Return a 128-bit BitAverageHaloHash fingerprint in hex from file `content`
"""
# Create fingerprint
content_hash, ngs_count = create_content_hash(
content,
ngram_length=ngram_length
)
# Create a file fingerprint from the number of elements in the content hash
# and the content hash digest itself.
ngs = ngrams(words, ngram_length)
ngs_bytes = [[g.encode("utf-8") for g in ng] for ng in ngs]
ngs_bytes = [b"".join(ng) for ng in ngs_bytes]
content_hash, ngs_count = BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
if content_hash:
content_fingerprint = content_hash.hexdigest().decode('utf-8')
ngs_count_hex_str = '%08x' % ngs_count
content_fingerprint = content_hash.hexdigest().decode("utf-8")
ngs_count_hex_str = "%08x" % ngs_count
file_fingerprint = ngs_count_hex_str + content_fingerprint
return file_fingerprint
fingerprints["halo1"] = file_fingerprint

# Select windows from the content to find snippet similarities
windows = ngrams(words, window_length)
selected_windows = select_ngrams(windows)
selected_windows_bytes = [[g.encode("utf-8") for g in window] for window in selected_windows]
selected_windows_bytes = [b"".join(window) for window in selected_windows_bytes]
snippets = [
BitAverageHaloHash(window).hexdigest().decode("utf-8") for window in selected_windows_bytes
]
if snippets:
fingerprints["snippets"] = snippets

return fingerprints
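
To make the two kinds of hashes concrete, here is a small sketch that drives create_file_fingerprints directly on an in-memory string (the sample content is invented): halo1 hashes every ngram of the file into one fingerprint, while each snippet hash covers one selected window of window_length words.

from matchcode_toolkit.fingerprinting import create_file_fingerprints

content = "int add(int a, int b) { return a + b; }\n" * 50  # invented sample content
fingerprints = create_file_fingerprints(content, ngram_length=8, window_length=64)
print(fingerprints["halo1"][:8])    # hex count of ngrams hashed into the file fingerprint
print(fingerprints["halo1"][8:])    # 32-hex-digit whole-file BitAverageHaloHash
print(fingerprints["snippets"][0])  # 32-hex-digit hash of one selected 64-word window
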
22 changes: 17 additions & 5 deletions src/matchcode_toolkit/halohash.py
@@ -176,8 +176,10 @@ def __init__(self, msg=None, size_in_bits=128):
# TODO: pick one hash algorithm
self.hashmodule = commoncode_hash.get_hasher(size_in_bits)
except:
raise Exception('No available hash module for the requested '
'hash size in bits: %(size_in_bits)d' % locals())
raise Exception(
"No available hash module for the requested "
"hash size in bits: %(size_in_bits)d" % locals()
)
self.update(msg)

@property
@@ -190,7 +192,13 @@ def update(self, msg):
"""
if not msg:
return
if isinstance(msg, (list, tuple,)):
if isinstance(
msg,
(
list,
tuple,
),
):
for m in msg:
self.__hashup(m)
else:
@@ -242,7 +250,9 @@ def combine(cls, hashes):
"""
size_in_bits = hashes[0].size_in_bits
for h in hashes:
assert isinstance(h, cls), 'all hashes should be a BitAverageHaloHash, not {}'.format(type(h))
assert isinstance(
h, cls
), "all hashes should be a BitAverageHaloHash, not {}".format(type(h))
assert h.size_in_bits == size_in_bits

all_columns = [h.columns for h in hashes]
@@ -313,7 +323,9 @@ def slices(s, size):
... pass
"""
length = len(s)
assert length % size == 0, 'Invalid slice size: len(%(s)r) is not a multiple of %(size)r' % locals()
assert length % size == 0, (
"Invalid slice size: len(%(s)r) is not a multiple of %(size)r" % locals()
)
# TODO: time alternative
# return [s[index:index + size] for index in range(0, length, size)]
chunks = [iter(s)] * size
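
For orientation, a brief sketch of the BitAverageHaloHash calls exercised by the fingerprinting code (the messages are invented):

from matchcode_toolkit.halohash import BitAverageHaloHash

bah = BitAverageHaloHash(size_in_bits=128)
bah.update(b"a single message")                         # update() accepts one message...
bah.update([b"or", b"a", b"list", b"of", b"messages"])  # ...or a list/tuple of them
print(bah.hexdigest().decode("utf-8"))                  # 32 hex characters for a 128-bit hash
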
14 changes: 7 additions & 7 deletions src/matchcode_toolkit/plugin_fingerprint.py
@@ -9,30 +9,30 @@

import attr

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import SCAN_GROUP
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl

from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes


@scan_impl
class FingerprintScanner(ScanPlugin):
resource_attributes = dict(
directory_content_fingerprint=attr.ib(default=None, repr=False),
directory_structure_fingerprint=attr.ib(default=None, repr=False),
halo1=attr.ib(default=None, repr=False),
snippets=attr.ib(default=None, repr=False),
)
sort_order = 6
options = [
PluggableCommandLineOption(
(
'--fingerprint',
),
("--fingerprint",),
is_flag=True,
default=False,
help='Compute directory and resource fingerprints that are used for matching',
help="Compute directory and resource fingerprints that are used for matching",
help_group=SCAN_GROUP,
sort_order=20,
)
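
With --fingerprint enabled, each scanned file resource carries the two new attributes alongside the existing directory fingerprints; a hypothetical excerpt of one file entry in the scan results (all values invented):

resource_entry = {
    "path": "project/src/example.c",
    "halo1": "000001a47d9a...",          # placeholder: ngram count + halohash digest
    "snippets": ["3f2b...", "9c81..."],  # placeholder snippet hashes
    "directory_content_fingerprint": None,
    "directory_structure_fingerprint": None,
}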