Skip to content

Commit

Permalink
Add test for hailstorm snippet match
Browse files Browse the repository at this point in the history
Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Oct 24, 2024
1 parent cf0107b commit a2ab3e0
Show file tree
Hide file tree
Showing 3 changed files with 554 additions and 11 deletions.
34 changes: 23 additions & 11 deletions tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,15 +139,15 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):

expected_result1_indexed_elements_count = 6395
expected_result2_indexed_elements_count = 6388
assert result1_indexed_elements_count == expected_result1_indexed_elements_count
assert result2_indexed_elements_count == expected_result2_indexed_elements_count
self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)

expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
expected_result2_fingerprint = "aa3a49e4cd40718d1297be519e6564a4"
assert result1_fingerprint == expected_result1_fingerprint
assert result2_fingerprint == expected_result2_fingerprint
self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
self.assertEqual(expected_result2_fingerprint, result2_fingerprint)

assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 2
self.assertEqual(2, byte_hamming_distance(result1_fingerprint, result2_fingerprint))

def test_get_file_fingerprint_hashes_one_line_added(self):
test_file1 = self.get_test_loc("inflate.c")
Expand All @@ -161,12 +161,24 @@ def test_get_file_fingerprint_hashes_one_line_added(self):

expected_result1_indexed_elements_count = 6395
expected_result2_indexed_elements_count = 6398
assert result1_indexed_elements_count == expected_result1_indexed_elements_count
assert result2_indexed_elements_count == expected_result2_indexed_elements_count
self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)

expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
expected_result2_fingerprint = "a23b49e4cd40708d1297be719c6564a4"
assert result1_fingerprint == expected_result1_fingerprint
assert result2_fingerprint == expected_result2_fingerprint

assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 3
self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
self.assertEqual(expected_result2_fingerprint, result2_fingerprint)

self.assertEqual(3, byte_hamming_distance(result1_fingerprint, result2_fingerprint))

def test_hailstorm_similarity(self):
    """Check which "hailstorm" hashes adler32.c and zutil.c have in common.

    The zutil.c fixture has had one function from adler32.c added to it,
    so the two files are expected to share exactly the hashes listed below.
    """
    adler32_loc = self.get_test_loc("hailstorm/adler32.c")
    zutil_loc = self.get_test_loc("hailstorm/zutil.c")

    adler32_hashes = get_file_fingerprint_hashes(adler32_loc).get("hailstorm")
    zutil_hashes = get_file_fingerprint_hashes(zutil_loc).get("hailstorm")

    # Only the hashes present in both files matter for this check.
    shared_hashes = set(adler32_hashes) & set(zutil_hashes)
    expected_shared_hashes = {
        "16e774a453769c012ca1e7f3685b4111",
        "498885acf844eda1f65af9e746deaff7",
    }
    self.assertEqual(expected_shared_hashes, shared_hashes)
179 changes: 179 additions & 0 deletions tests/testfiles/fingerprinting/hailstorm/adler32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/

/* @(#) $Id$ */

#include "zutil.h"

/* `local` marks symbols that are private to this translation unit. */
#define local static

/* Forward declaration of the shared implementation behind the two public
   adler32_combine entry points defined at the end of this file. */
local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));

#define BASE 65521 /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

/* Unrolled accumulation helpers.  DO1 adds byte (buf)[i] to `adler` and then
   adds the new `adler` to `sum2`; both names are taken from the scope in
   which the macro is expanded.  DO2..DO16 chain 2..16 such steps.
   NOTE: DO2 and up expand to several statements, so they must not be used
   as the body of an unbraced if/else/loop. */
#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;}
#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
#define DO16(buf) DO8(buf,0); DO8(buf,8);

/* use NO_DIVIDE if your processor does not do division in hardware --
try it both ways to see which is faster */
/* All of the macros below modify their argument in place, so `a` must be an
   lvalue; the NO_DIVIDE variants also evaluate `a` more than once. */
#ifdef NO_DIVIDE
/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
(thank you to John Reiser for pointing this out) */
/* CHOP(a): fold the bits above bit 15 back into the low 16 bits.
   (tmp << 4) - tmp is 15 * tmp, so the result is congruent to the input
   modulo BASE but is not necessarily below BASE yet. */
# define CHOP(a) \
do { \
unsigned long tmp = a >> 16; \
a &= 0xffffUL; \
a += (tmp << 4) - tmp; \
} while (0)
/* MOD28(a): one CHOP followed by one conditional subtraction of BASE.
   For inputs below 2^28 the CHOP result is below 2 * BASE, so this is a
   complete reduction. */
# define MOD28(a) \
do { \
CHOP(a); \
if (a >= BASE) a -= BASE; \
} while (0)
/* MOD(a): an extra CHOP before MOD28, i.e. two folds and then one
   conditional subtraction of BASE. */
# define MOD(a) \
do { \
CHOP(a); \
MOD28(a); \
} while (0)
/* MOD63(a): reduction for a z_off64_t value.  The first step folds the bits
   above bit 31 using (tmp << 8) - (tmp << 5) + tmp == 225 * tmp, where
   225 == 15 * 15 is 2^32 modulo BASE; two 16-bit folds and one conditional
   subtraction follow. */
# define MOD63(a) \
do { /* this assumes a is not negative */ \
z_off64_t tmp = a >> 32; \
a &= 0xffffffffL; \
a += (tmp << 8) - (tmp << 5) + tmp; \
tmp = a >> 16; \
a &= 0xffffL; \
a += (tmp << 4) - tmp; \
tmp = a >> 16; \
a &= 0xffffL; \
a += (tmp << 4) - tmp; \
if (a >= BASE) a -= BASE; \
} while (0)
#else
/* With hardware division available every variant is a plain remainder. */
# define MOD(a) a %= BASE
# define MOD28(a) a %= BASE
# define MOD63(a) a %= BASE
#endif

/* ========================================================================= */
/*
 * Fold len bytes from buf into the running checksum adler and return the
 * updated checksum.
 *
 * adler: running checksum; bits 0-15 and bits 16-31 are used as the two
 *        component sums, anything above bit 31 is ignored.
 * buf:   input bytes.  When buf is Z_NULL and len != 1 the function
 *        returns 1L.
 * len:   number of bytes to consume from buf.
 *
 * Returns the two component sums recombined as adler | (sum2 << 16).
 *
 * NOTE: the Z_NULL test is placed after the len == 1 fast path, so buf[0]
 * is read unconditionally when len == 1.
 */
uLong ZEXPORT adler32(adler, buf, len)
uLong adler;
const Bytef *buf;
uInt len;
{
unsigned long sum2;
unsigned n;

/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;

/* in case user likes doing a byte at a time, keep it fast */
/* conditional subtractions are used here instead of the MOD macros */
if (len == 1) {
adler += buf[0];
if (adler >= BASE)
adler -= BASE;
sum2 += adler;
if (sum2 >= BASE)
sum2 -= BASE;
return adler | (sum2 << 16);
}

/* initial Adler-32 value (deferred check for len == 1 speed) */
if (buf == Z_NULL)
return 1L;

/* in case short lengths are provided, keep it somewhat fast */
if (len < 16) {
while (len--) {
adler += *buf++;
sum2 += adler;
}
/* adler needs only a conditional subtraction; sum2 goes through MOD28 */
if (adler >= BASE)
adler -= BASE;
MOD28(sum2); /* only added so many BASE's */
return adler | (sum2 << 16);
}

/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16; /* NMAX is divisible by 16 */
do {
/* DO16 updates the locals named adler and sum2 directly */
DO16(buf); /* 16 sums unrolled */
buf += 16;
} while (--n);
MOD(adler);
MOD(sum2);
}

/* do remaining bytes (less than NMAX, still just one modulo) */
if (len) { /* avoid modulos if none remaining */
while (len >= 16) {
len -= 16;
DO16(buf);
buf += 16;
}
/* fewer than 16 bytes are left at this point */
while (len--) {
adler += *buf++;
sum2 += adler;
}
MOD(adler);
MOD(sum2);
}

/* return recombined sums */
return adler | (sum2 << 16);
}

/* ========================================================================= */
local uLong adler32_combine_(adler1, adler2, len2)
    uLong adler1;
    uLong adler2;
    z_off64_t len2;
{
    unsigned long low;      /* combined low 16-bit component */
    unsigned long high;     /* combined high 16-bit component */
    unsigned rem;           /* len2 after reduction by MOD63 */

    /*
     * Shared implementation behind adler32_combine() and
     * adler32_combine64(): merges the checksums adler1 and adler2 into one.
     * Presumably len2 is the length of the data that adler2 covers -- see
     * zlib.h for the public contract.
     */

    /* a negative length cannot be valid: return an obviously invalid
       checksum so the mistake is easy to spot while debugging */
    if (len2 < 0) {
        return 0xffffffffUL;
    }

    /* reduce the length first (MOD63 assumes len2 >= 0, checked above) */
    MOD63(len2);
    rem = (unsigned)len2;

    /* low component of adler1, and its contribution scaled by rem */
    low = adler1 & 0xffff;
    high = rem * low;
    MOD(high);

    /* add the matching components of both checksums; the extra BASE terms
       keep the unsigned arithmetic from wrapping below zero */
    low += (adler2 & 0xffff) + BASE - 1;
    high += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;

    /* bring both components back into range with conditional subtractions */
    if (low >= BASE) {
        low -= BASE;
    }
    if (low >= BASE) {
        low -= BASE;
    }
    if (high >= (BASE << 1)) {
        high -= (BASE << 1);
    }
    if (high >= BASE) {
        high -= BASE;
    }

    return low | (high << 16);
}

/* ========================================================================= */
/*
 * Public entry point taking the length as a z_off_t.  All of the work is
 * done by adler32_combine_(), whose result is returned unchanged.
 */
uLong ZEXPORT adler32_combine(adler1, adler2, len2)
uLong adler1;
uLong adler2;
z_off_t len2;
{
return adler32_combine_(adler1, adler2, len2);
}

/*
 * Public entry point taking the length as a z_off64_t.  All of the work is
 * done by adler32_combine_(), whose result is returned unchanged.
 */
uLong ZEXPORT adler32_combine64(adler1, adler2, len2)
uLong adler1;
uLong adler2;
z_off64_t len2;
{
return adler32_combine_(adler1, adler2, len2);
}
Loading

0 comments on commit a2ab3e0

Please sign in to comment.