diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
index 90b7ad3..aeef5d6 100644
--- a/src/matchcode_toolkit/fingerprinting.py
+++ b/src/matchcode_toolkit/fingerprinting.py
@@ -12,12 +12,11 @@ from matchcode_toolkit.halohash import BitAverageHaloHash
 
-
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
     # This is both the directory content and directory structure fingerprint for
     # an empty directory.
-    '0000000000000000000000000000000000000000',
+    "0000000000000000000000000000000000000000",
 ]
 
 
@@ -25,11 +24,11 @@ def _create_directory_fingerprint(inputs):
     """
     Return a 128-bit BitAverageHaloHash fingerprint in hex from `inputs`
     """
-    inputs = [i.encode('utf-8') for i in inputs if i]
+    inputs = [i.encode("utf-8") for i in inputs if i]
     bah128 = BitAverageHaloHash(inputs, size_in_bits=128).hexdigest()
     inputs_count = len(inputs)
-    inputs_count_hex_str = '%08x' % inputs_count
-    bah128 = bah128.decode('utf-8')
+    inputs_count_hex_str = "%08x" % inputs_count
+    bah128 = bah128.decode("utf-8")
     directory_fingerprint = inputs_count_hex_str + bah128
     return directory_fingerprint
 
@@ -55,7 +54,7 @@ def _get_resource_subpath(resource, top):
     The subpath returned would be 'baz.c'
     """
     _, _, subpath = resource.path.partition(top.path)
-    subpath = subpath.lstrip('/')
+    subpath = subpath.lstrip("/")
     return subpath
 
@@ -88,16 +87,16 @@ def _compute_directory_fingerprints(directory, codebase):
         return
 
     directory_content_fingerprint = create_content_fingerprint(children)
-    if hasattr(directory, 'directory_content_fingerprint'):
+    if hasattr(directory, "directory_content_fingerprint"):
         directory.directory_content_fingerprint = directory_content_fingerprint
     else:
-        directory.extra_data['directory_content'] = directory_content_fingerprint
+        directory.extra_data["directory_content"] = directory_content_fingerprint
 
     directory_structure_fingerprint = create_structure_fingerprint(directory, children)
-    if hasattr(directory, 'directory_structure_fingerprint'):
+    if hasattr(directory, "directory_structure_fingerprint"):
         directory.directory_structure_fingerprint = directory_structure_fingerprint
     else:
-        directory.extra_data['directory_structure'] = directory_structure_fingerprint
+        directory.extra_data["directory_structure"] = directory_structure_fingerprint
 
     directory.save(codebase)
     return directory
 
@@ -162,7 +161,7 @@ def create_halohash_chunks(bah128):
 
 
 # Split on whitespace and punctuations: keep only characters and numbers
-query_pattern = '[^_\\W]+'
+query_pattern = "[^_\\W]+"
 word_splitter = re.compile(query_pattern, re.UNICODE).findall
 
 
@@ -237,28 +236,30 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
         "hailstorm": [],
     }
 
-    # Create fingerprint
-    words = tokenizer(content)
+    # Tokenize content into words
+    words = list(tokenizer(content))
+
+    # Create a file fingerprint from the number of elements in the content hash
+    # and the content hash digest itself.
     ngs = ngrams(words, ngram_length)
-    ngs_bytes = [[g.encode('utf-8') for g in ng] for ng in ngs]
-    ngs_bytes = [b''.join(ng) for ng in ngs_bytes]
+    ngs_bytes = [[g.encode("utf-8") for g in ng] for ng in ngs]
+    ngs_bytes = [b"".join(ng) for ng in ngs_bytes]
     content_hash, ngs_count = BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
     if content_hash:
-        content_fingerprint = content_hash.hexdigest().decode('utf-8')
-        ngs_count_hex_str = '%08x' % ngs_count
+        content_fingerprint = content_hash.hexdigest().decode("utf-8")
+        ngs_count_hex_str = "%08x" % ngs_count
         file_fingerprint = ngs_count_hex_str + content_fingerprint
-        fingerprints['halo1'] = file_fingerprint
+        fingerprints["halo1"] = file_fingerprint
 
-    words = tokenizer(content)
+    # Select windows from the content to find snippet similarities
     windows = ngrams(words, window_length)
     selected_windows = select_ngrams(windows)
-    selected_windows_bytes = [[g.encode('utf-8') for g in window] for window in selected_windows]
-    selected_windows_bytes = [b''.join(window) for window in selected_windows_bytes]
+    selected_windows_bytes = [[g.encode("utf-8") for g in window] for window in selected_windows]
+    selected_windows_bytes = [b"".join(window) for window in selected_windows_bytes]
     hailstorm_hashes = [
-        BitAverageHaloHash(window).hexdigest().decode('utf-8')
-        for window in selected_windows_bytes
+        BitAverageHaloHash(window).hexdigest().decode("utf-8") for window in selected_windows_bytes
     ]
     if hailstorm_hashes:
-        fingerprints['hailstorm'] = hailstorm_hashes
+        fingerprints["hailstorm"] = hailstorm_hashes
 
     return fingerprints
diff --git a/src/matchcode_toolkit/halohash.py b/src/matchcode_toolkit/halohash.py
index 538db55..36aa00f 100644
--- a/src/matchcode_toolkit/halohash.py
+++ b/src/matchcode_toolkit/halohash.py
@@ -176,8 +176,10 @@ def __init__(self, msg=None, size_in_bits=128):
             # TODO: pick one hash algorithm
             self.hashmodule = commoncode_hash.get_hasher(size_in_bits)
         except:
-            raise Exception('No available hash module for the requested '
-                            'hash size in bits: %(size_in_bits)d' % locals())
+            raise Exception(
+                "No available hash module for the requested "
+                "hash size in bits: %(size_in_bits)d" % locals()
+            )
         self.update(msg)
 
     @property
@@ -190,7 +192,13 @@ def update(self, msg):
         """
         if not msg:
             return
-        if isinstance(msg, (list, tuple,)):
+        if isinstance(
+            msg,
+            (
+                list,
+                tuple,
+            ),
+        ):
             for m in msg:
                 self.__hashup(m)
         else:
@@ -242,7 +250,9 @@ def combine(cls, hashes):
         """
         size_in_bits = hashes[0].size_in_bits
         for h in hashes:
-            assert isinstance(hash, cls), 'all hashes should be a BitAverageHaloHash, not {}'.format(type(h))
+            assert isinstance(
+                hash, cls
+            ), "all hashes should be a BitAverageHaloHash, not {}".format(type(h))
             assert h.size_in_bits == size_in_bits
 
         all_columns = [h.columns for h in hashes]
@@ -313,7 +323,9 @@ def slices(s, size):
     ...     pass
     """
     length = len(s)
-    assert length % size == 0, 'Invalid slice size: len(%(s)r) is not a multiple of %(size)r' % locals()
+    assert length % size == 0, (
+        "Invalid slice size: len(%(s)r) is not a multiple of %(size)r" % locals()
+    )
     # TODO: time alternative
     # return [s[index:index + size] for index in range(0, length, size)]
     chunks = [iter(s)] * size
diff --git a/src/matchcode_toolkit/plugin_fingerprint.py b/src/matchcode_toolkit/plugin_fingerprint.py
index 6046fd6..ee972ef 100644
--- a/src/matchcode_toolkit/plugin_fingerprint.py
+++ b/src/matchcode_toolkit/plugin_fingerprint.py
@@ -9,13 +9,14 @@
 
 import attr
 
-from commoncode.cliutils import PluggableCommandLineOption
 from commoncode.cliutils import SCAN_GROUP
-from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
-from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
+from commoncode.cliutils import PluggableCommandLineOption
 from plugincode.scan import ScanPlugin
 from plugincode.scan import scan_impl
 
+from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
+from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
+
 
 @scan_impl
 class FingerprintScanner(ScanPlugin):
@@ -28,12 +29,10 @@ class FingerprintScanner(ScanPlugin):
     sort_order = 6
     options = [
         PluggableCommandLineOption(
-            (
-                '--fingerprint',
-            ),
+            ("--fingerprint",),
             is_flag=True,
             default=False,
-            help='Compute directory and resource fingerprints that are used for matching',
+            help="Compute directory and resource fingerprints that are used for matching",
             help_group=SCAN_GROUP,
             sort_order=20,
         )
diff --git a/tests/test_fingerprinting.py b/tests/test_fingerprinting.py
index 19bb9c6..dd4bc6e 100644
--- a/tests/test_fingerprinting.py
+++ b/tests/test_fingerprinting.py
@@ -23,160 +23,150 @@ from matchcode_toolkit.halohash import byte_hamming_distance
 
 
-class Resource():
-    def __init__(self, path='', size=0, sha1=''):
+class Resource:
+    def __init__(self, path="", size=0, sha1=""):
         self.path = path
         self.size = size
         self.sha1 = sha1
 
 
 class TestFingerprintingFunctions(FileBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(
-        __file__), 'testfiles/fingerprinting')
+    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles/fingerprinting")
 
     def test__create_directory_fingerprint(self):
         test_input = [
-            'package',
-            'package/readme.txt',
-            'package/index.js',
-            'package/package.json',
+            "package",
+            "package/readme.txt",
+            "package/index.js",
+            "package/package.json",
         ]
         directory_fingerprint = _create_directory_fingerprint(test_input)
-        expected_directory_fingerprint = '0000000410d24471969646cb5402032288493126'
+        expected_directory_fingerprint = "0000000410d24471969646cb5402032288493126"
         self.assertEqual(expected_directory_fingerprint, directory_fingerprint)
         indexed_elements_count, _ = split_fingerprint(directory_fingerprint)
         self.assertEqual(len(test_input), indexed_elements_count)
 
     def test_split_fingerprint(self):
-        directory_fingerprint = '0000000410d24471969646cb5402032288493126'
-        indexed_elements_count, bah128 = split_fingerprint(
-            directory_fingerprint)
+        directory_fingerprint = "0000000410d24471969646cb5402032288493126"
+        indexed_elements_count, bah128 = split_fingerprint(directory_fingerprint)
 
         expected_indexed_elements_count = 4
-        self.assertEqual(expected_indexed_elements_count,
-                         indexed_elements_count)
+        self.assertEqual(expected_indexed_elements_count, indexed_elements_count)
 
-        expected_bah128 = '10d24471969646cb5402032288493126'
+        expected_bah128 = "10d24471969646cb5402032288493126"
"10d24471969646cb5402032288493126" self.assertEqual(expected_bah128, bah128) def test_create_content_fingerprint(self): test_resources = [ - Resource(sha1='d4e4abbe8e2a8169d6a52907152c2c80ec884745'), - Resource(sha1='0c94f137f6e0536db8cb2622a9dc84253b91b90c'), - Resource(sha1='10cab45fe6f353b47b587a576c1077a96ce348f5'), - Resource(sha1='134f2b052b6e5f56b631be2eded70f89d44cf381'), + Resource(sha1="d4e4abbe8e2a8169d6a52907152c2c80ec884745"), + Resource(sha1="0c94f137f6e0536db8cb2622a9dc84253b91b90c"), + Resource(sha1="10cab45fe6f353b47b587a576c1077a96ce348f5"), + Resource(sha1="134f2b052b6e5f56b631be2eded70f89d44cf381"), ] fingerprint = create_content_fingerprint(test_resources) - expected_fingerprint = '00000004005b88c2800f0044044781ae05680419' + expected_fingerprint = "00000004005b88c2800f0044044781ae05680419" self.assertEqual(expected_fingerprint, fingerprint) def test__get_resource_subpath(self): - test_resource = Resource(path='foo/bar/baz/qux.c') - test_top_resource = Resource(path='foo/bar/') + test_resource = Resource(path="foo/bar/baz/qux.c") + test_top_resource = Resource(path="foo/bar/") subpath = _get_resource_subpath(test_resource, test_top_resource) - expected_subpath = 'baz/qux.c' + expected_subpath = "baz/qux.c" self.assertEqual(expected_subpath, subpath) def test_create_structure_fingerprint(self): - test_top_resource = Resource(path='package') + test_top_resource = Resource(path="package") test_child_resources = [ - Resource(path='package/readme.txt', size=771), - Resource(path='package/index.js', size=608), - Resource(path='package/package.json', size=677), + Resource(path="package/readme.txt", size=771), + Resource(path="package/index.js", size=608), + Resource(path="package/package.json", size=677), ] - fingerprint = create_structure_fingerprint( - test_top_resource, test_child_resources) - expected_fingerprint = '00000003ce72f4308a1bc1afb0fb47ed590b5c53' + fingerprint = create_structure_fingerprint(test_top_resource, test_child_resources) + expected_fingerprint = "00000003ce72f4308a1bc1afb0fb47ed590b5c53" self.assertEqual(expected_fingerprint, fingerprint) def test_create_halohash_chunks(self): - test_bah128 = 'ce72f4308a1bc1afb0fb47ed590b5c53' + test_bah128 = "ce72f4308a1bc1afb0fb47ed590b5c53" chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(test_bah128) - expected_chunk1 = bytearray(b'\xcer\xf40') - expected_chunk2 = bytearray(b'\x8a\x1b\xc1\xaf') - expected_chunk3 = bytearray(b'\xb0\xfbG\xed') - expected_chunk4 = bytearray(b'Y\x0b\\S') + expected_chunk1 = bytearray(b"\xcer\xf40") + expected_chunk2 = bytearray(b"\x8a\x1b\xc1\xaf") + expected_chunk3 = bytearray(b"\xb0\xfbG\xed") + expected_chunk4 = bytearray(b"Y\x0b\\S") self.assertEqual(chunk1, expected_chunk1) self.assertEqual(chunk2, expected_chunk2) self.assertEqual(chunk3, expected_chunk3) self.assertEqual(chunk4, expected_chunk4) def test_compute_codebase_directory_fingerprints(self): - scan_loc = self.get_test_loc('abbrev-1.0.3-i.json') + scan_loc = self.get_test_loc("abbrev-1.0.3-i.json") vc = VirtualCodebase(location=scan_loc) vc = compute_codebase_directory_fingerprints(vc) - directory_content = vc.root.extra_data['directory_content'] - directory_structure = vc.root.extra_data['directory_structure'] - expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b' - expected_directory_structure = '000000034f9bf110673bdf06197cd514a799a66c' + directory_content = vc.root.extra_data["directory_content"] + directory_structure = vc.root.extra_data["directory_structure"] + expected_directory_content = 
"0000000346ce04751a3c98f00086f16a91d9790b" + expected_directory_structure = "000000034f9bf110673bdf06197cd514a799a66c" self.assertEqual(expected_directory_content, directory_content) self.assertEqual(expected_directory_structure, directory_structure) def test_do_not_compute_fingerprint_for_empty_dirs(self): - scan_loc = self.get_test_loc('test.json') + scan_loc = self.get_test_loc("test.json") vc = VirtualCodebase(location=scan_loc) vc = compute_codebase_directory_fingerprints(vc) - directory_content = vc.root.extra_data['directory_content'] - directory_structure = vc.root.extra_data['directory_structure'] - expected_directory_content = '000000032a5fa8d01922536b53e8fc6e3d43766f' - expected_directory_structure = '000000030a399ce2b947a6f611821965a4fcc577' + directory_content = vc.root.extra_data["directory_content"] + directory_structure = vc.root.extra_data["directory_structure"] + expected_directory_content = "000000032a5fa8d01922536b53e8fc6e3d43766f" + expected_directory_structure = "000000030a399ce2b947a6f611821965a4fcc577" self.assertEqual(expected_directory_content, directory_content) self.assertEqual(expected_directory_structure, directory_structure) # These directories should not have fingerprints generated or stored in # extra_data - empty_dir_1 = vc.get_resource('test/test') - empty_dir_2 = vc.get_resource('test/test/test2') + empty_dir_1 = vc.get_resource("test/test") + empty_dir_2 = vc.get_resource("test/test/test2") self.assertEqual({}, empty_dir_1.extra_data) self.assertEqual({}, empty_dir_1.extra_data) self.assertEqual({}, empty_dir_2.extra_data) self.assertEqual({}, empty_dir_2.extra_data) def test_get_file_fingerprint_hashes_one_line_removed(self): - test_file1 = self.get_test_loc('inflate.c') - test_file2 = self.get_test_loc('inflate-mod.c') + test_file1 = self.get_test_loc("inflate.c") + test_file2 = self.get_test_loc("inflate-mod.c") result1 = get_file_fingerprint_hashes(test_file1) result2 = get_file_fingerprint_hashes(test_file2) - result1 = result1.get('halo1') - result2 = result2.get('halo1') - result1_indexed_elements_count, result1_fingerprint = split_fingerprint( - result1) - result2_indexed_elements_count, result2_fingerprint = split_fingerprint( - result2) + result1 = result1.get("halo1") + result2 = result2.get("halo1") + result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1) + result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2) expected_result1_indexed_elements_count = 6395 expected_result2_indexed_elements_count = 6388 assert result1_indexed_elements_count == expected_result1_indexed_elements_count assert result2_indexed_elements_count == expected_result2_indexed_elements_count - expected_result1_fingerprint = 'a23a49e4cd40718d1297be719e6564a4' - expected_result2_fingerprint = 'aa3a49e4cd40718d1297be519e6564a4' + expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4" + expected_result2_fingerprint = "aa3a49e4cd40718d1297be519e6564a4" assert result1_fingerprint == expected_result1_fingerprint assert result2_fingerprint == expected_result2_fingerprint - assert byte_hamming_distance( - result1_fingerprint, result2_fingerprint) == 2 + assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 2 def test_get_file_fingerprint_hashes_one_line_added(self): - test_file1 = self.get_test_loc('inflate.c') - test_file2 = self.get_test_loc('inflate-mod2.c') + test_file1 = self.get_test_loc("inflate.c") + test_file2 = self.get_test_loc("inflate-mod2.c") result1 = 
         result2 = get_file_fingerprint_hashes(test_file2)
-        result1 = result1.get('halo1')
-        result2 = result2.get('halo1')
-        result1_indexed_elements_count, result1_fingerprint = split_fingerprint(
-            result1)
-        result2_indexed_elements_count, result2_fingerprint = split_fingerprint(
-            result2)
+        result1 = result1.get("halo1")
+        result2 = result2.get("halo1")
+        result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
+        result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
 
         expected_result1_indexed_elements_count = 6395
         expected_result2_indexed_elements_count = 6398
         assert result1_indexed_elements_count == expected_result1_indexed_elements_count
         assert result2_indexed_elements_count == expected_result2_indexed_elements_count
 
-        expected_result1_fingerprint = 'a23a49e4cd40718d1297be719e6564a4'
-        expected_result2_fingerprint = 'a23b49e4cd40708d1297be719c6564a4'
+        expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
+        expected_result2_fingerprint = "a23b49e4cd40708d1297be719c6564a4"
         assert result1_fingerprint == expected_result1_fingerprint
         assert result2_fingerprint == expected_result2_fingerprint
-        assert byte_hamming_distance(
-            result1_fingerprint, result2_fingerprint) == 3
+        assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 3
diff --git a/tests/test_halohash.py b/tests/test_halohash.py
index cb45ae2..5bfe7a5 100644
--- a/tests/test_halohash.py
+++ b/tests/test_halohash.py
@@ -7,20 +7,18 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-from collections import defaultdict
-
+import copy
 import csv
 import math
-import copy
 import os
 import random
 import subprocess
+from collections import defaultdict
 
 from commoncode.testcase import FileBasedTesting
 
 from matchcode_toolkit import halohash
 
-
 SIZE_IN_BITS = 128
@@ -30,10 +28,7 @@ def load_csv(location):
     mappings field->value).
     """
     with open(location) as csvin:
-        reader = csv.DictReader(
-            csvin,
-            quoting=csv.QUOTE_NONNUMERIC
-        )
+        reader = csv.DictReader(csvin, quoting=csv.QUOTE_NONNUMERIC)
         fields = reader.fieldnames
         values = sorted(reader, key=lambda d: d.items())
         return fields, values
@@ -49,12 +44,8 @@ def check_results(
 
     Load and compare the CSV at `expected_file` against `results`.
     """
    if regen:
-        with open(expected_file, 'w') as f:
-            writer = csv.DictWriter(
-                f,
-                fieldnames=fieldnames,
-                quoting=csv.QUOTE_NONNUMERIC
-            )
+        with open(expected_file, "w") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
             writer.writeheader()
             writer.writerows(results)
@@ -64,11 +55,11 @@ def check_results(
     # check results line by line for more compact results
     for exp, res in zip(expected, results):
         assert exp[column1_name] == res[column1_name]
-        expected_mean_hamming_distance = exp['mean hamming distance']
-        expected_standard_deviation = exp['standard deviation']
+        expected_mean_hamming_distance = exp["mean hamming distance"]
+        expected_standard_deviation = exp["standard deviation"]
         exp_min = expected_mean_hamming_distance - expected_standard_deviation
         exp_max = expected_mean_hamming_distance + expected_standard_deviation
-        assert exp_min <= res['mean hamming distance'] <= exp_max
+        assert exp_min <= res["mean hamming distance"] <= exp_max
 
 
 def calculate_hamming_distance(content_hash, modified_content):
@@ -85,13 +76,11 @@ def calculate_mean_and_standard_deviation(hamming_distances):
     number_of_hamming_distances = len(hamming_distances)
 
     # 1: Find the mean.
-    mean_hamming_distance = sum(hamming_distances) / \
-        number_of_hamming_distances
+    mean_hamming_distance = sum(hamming_distances) / number_of_hamming_distances
 
     # 2: For each data point, find the square of its distance to the mean, then sum the values.
     s0 = sum(
-        (hamming_distance - mean_hamming_distance) ** 2
-        for hamming_distance in hamming_distances
+        (hamming_distance - mean_hamming_distance) ** 2 for hamming_distance in hamming_distances
     )
 
     # 3: Divide by the number of data points.
@@ -104,16 +93,17 @@
 
 
 class TestHalohash(FileBasedTesting):
-    test_data_dir = os.path.join(
-        os.path.dirname(__file__), 'testfiles/halohash')
+    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles/halohash")
 
     def setUp(self):
-        words_loc = self.get_test_loc('words.txt')
+        words_loc = self.get_test_loc("words.txt")
         with open(words_loc) as f:
-            self.original_content = [bytes(x.strip(), 'utf-8') for x in f]
+            self.original_content = [bytes(x.strip(), "utf-8") for x in f]
 
     def test_halohash_random_delete(self, regen=False):
-        for number_of_words in [500,]:
+        for number_of_words in [
+            500,
+        ]:
             content = copy.copy(self.original_content[:number_of_words])
             original_hash = halohash.BitAverageHaloHash(content)
@@ -125,36 +115,44 @@ def test_halohash_random_delete(self, regen=False):
             # we are moving towards unrelated files past that
             n = int(math.floor(len(modified_content) * 0.10))
             for _ in range(n):
-                hamming_distance = calculate_hamming_distance(
-                    original_hash,
-                    modified_content
-                )
+                hamming_distance = calculate_hamming_distance(original_hash, modified_content)
                 number_of_elements = len(modified_content)
                 hamming_distance_by_number_of_elements[number_of_elements].append(
-                    hamming_distance)
-                modified_content.pop(random.randint(
-                    0, len(modified_content) - 1))
+                    hamming_distance
+                )
+                modified_content.pop(random.randint(0, len(modified_content) - 1))
 
             # Take mean and standard deviation
             results = []
-            for number_of_elements, hamming_distances in hamming_distance_by_number_of_elements.items():
+            for (
+                number_of_elements,
+                hamming_distances,
+            ) in hamming_distance_by_number_of_elements.items():
                 mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(
-                    hamming_distances)
+                    hamming_distances
+                )
                 results.append(
                     {
-                        'number of hashed elements': int(number_of_elements),
-                        'mean hamming distance': round(mean_hamming_distance, 1),
-                        'standard deviation': round(standard_deviation, 1)
+                        "number of hashed elements": int(number_of_elements),
+                        "mean hamming distance": round(mean_hamming_distance, 1),
+                        "standard deviation": round(standard_deviation, 1),
                     }
                 )
 
             expected_results_loc = self.get_test_loc(
-                f'{number_of_words}-delete-expected-results.csv')
-            check_results(results, expected_results_loc, [
-                'number of hashed elements', 'mean hamming distance', 'standard deviation'], regen=regen)
+                f"{number_of_words}-delete-expected-results.csv"
+            )
+            check_results(
+                results,
+                expected_results_loc,
+                ["number of hashed elements", "mean hamming distance", "standard deviation"],
+                regen=regen,
+            )
 
     def test_halohash_random_replace(self, regen=False):
-        for number_of_words in [500,]:
+        for number_of_words in [
+            500,
+        ]:
             content = copy.copy(self.original_content[:number_of_words])
             original_hash = halohash.BitAverageHaloHash(content)
@@ -168,43 +166,49 @@ def test_halohash_random_replace(self, regen=False):
             # we are moving towards unrelated files past that
             n = int(math.floor(len(modified_content) * 0.10))
             for _ in range(n):
-                hamming_distance = calculate_hamming_distance(
-                    original_hash,
-                    modified_content
-                )
+                hamming_distance = calculate_hamming_distance(original_hash, modified_content)
                 hamming_distance_by_number_of_words_replaced[words_replaced].append(
-                    hamming_distance)
+                    hamming_distance
+                )
 
-                modified_content.pop(random.randint(
-                    0, len(modified_content) - 1))
+                modified_content.pop(random.randint(0, len(modified_content) - 1))
                 new_word = (
                     subprocess.run(
-                        ['shuf', '-n', '1', '/usr/share/dict/american-english'],
-                        stdout=subprocess.PIPE
+                        ["shuf", "-n", "1", "/usr/share/dict/american-english"],
+                        stdout=subprocess.PIPE,
                     )
-                    .stdout
-                    .decode('utf-8')
+                    .stdout.decode("utf-8")
                     .strip()
-                    .replace('"', '')
+                    .replace('"', "")
+                )
+                modified_content[random.randint(0, len(modified_content) - 1)] = bytes(
+                    new_word, "utf-8"
                 )
-                modified_content[random.randint(
-                    0, len(modified_content) - 1)] = bytes(new_word, 'utf-8')
                 words_replaced += 1
 
             # Take mean and standard deviation
             results = []
-            for words_replaced, hamming_distances in hamming_distance_by_number_of_words_replaced.items():
+            for (
+                words_replaced,
+                hamming_distances,
+            ) in hamming_distance_by_number_of_words_replaced.items():
                 mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(
-                    hamming_distances)
+                    hamming_distances
+                )
                 results.append(
                     {
-                        'words replaced': int(words_replaced),
-                        'mean hamming distance': round(mean_hamming_distance, 1),
-                        'standard deviation': round(standard_deviation, 1)
+                        "words replaced": int(words_replaced),
+                        "mean hamming distance": round(mean_hamming_distance, 1),
+                        "standard deviation": round(standard_deviation, 1),
                     }
                 )
 
             expected_results_loc = self.get_test_loc(
-                f'{number_of_words}-replaced-expected-results.csv')
-            check_results(results, expected_results_loc, [
-                'words replaced', 'mean hamming distance', 'standard deviation'], regen=regen)
+                f"{number_of_words}-replaced-expected-results.csv"
+            )
+            check_results(
+                results,
+                expected_results_loc,
+                ["words replaced", "mean hamming distance", "standard deviation"],
+                regen=regen,
+            )
diff --git a/tests/test_plugin_fingerprinting.py b/tests/test_plugin_fingerprinting.py
index 1f9b35a..65c4510 100644
--- a/tests/test_plugin_fingerprinting.py
+++ b/tests/test_plugin_fingerprinting.py
@@ -16,36 +16,36 @@
 
 from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
 
-
 """
 These tests spawn new process as if launched from the command line.
""" class TestPluginFingerprinting(FileBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_plugin_fingerprinting_api_works(self): - test_loc = self.get_test_loc('fingerprinting/inflate.c') + test_loc = self.get_test_loc("fingerprinting/inflate.c") detections = list(get_file_fingerprint_hashes(location=test_loc)) assert detections def test_fingerprinting_plugin_works(self): - test_dir = self.get_test_loc('fingerprinting', copy=True) - result_file = self.get_temp_file('json') + test_dir = self.get_test_loc("fingerprinting", copy=True) + result_file = self.get_temp_file("json") args = [ - '--info', - '--fingerprint', - '--verbose', - '--json', result_file, + "--info", + "--fingerprint", + "--verbose", + "--json", + result_file, test_dir, ] run_scan_click(args) - test_loc = self.get_test_loc('fingerprinting-expected.json') + test_loc = self.get_test_loc("fingerprinting-expected.json") check_json_scan( test_loc, result_file, remove_file_date=True, check_headers=False, - regen=REGEN_TEST_FIXTURES + regen=REGEN_TEST_FIXTURES, )