diff --git a/docs/checksum32.rst b/docs/checksum32.rst index 5e682afc..5b2013f8 100644 --- a/docs/checksum32.rst +++ b/docs/checksum32.rst @@ -33,3 +33,13 @@ Fletcher32 .. automethod:: encode .. automethod:: decode +JenkinsLookup3 +-------------- + +.. autoclass:: JenkinsLookup3 + + .. autoattribute:: codec_id + .. autoattribute:: initval + .. autoattribute:: prefix + .. automethod:: encode + .. automethod:: decode diff --git a/docs/release.rst b/docs/release.rst index 3f2394b5..c9033a85 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -17,6 +17,8 @@ Enhancements * Add ``fletcher32`` checksum codec By :user:`Martin Durant <martindurant>`, :issue:`410`. +* Add ``jenkins_lookup3`` checksum codec + By :user:`Mark Kittisopikul <mkitti>`, :issue:`445`. Fix ~~~ diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py index 1e3c8536..3d7befe2 100644 --- a/numcodecs/__init__.py +++ b/numcodecs/__init__.py @@ -98,9 +98,10 @@ from numcodecs.msgpacks import MsgPack register_codec(MsgPack) -from numcodecs.checksum32 import CRC32, Adler32 +from numcodecs.checksum32 import CRC32, Adler32, JenkinsLookup3 register_codec(CRC32) register_codec(Adler32) +register_codec(JenkinsLookup3) from numcodecs.json import JSON register_codec(JSON) diff --git a/numcodecs/checksum32.py b/numcodecs/checksum32.py index 06dfbdb4..35a5ab99 100644 --- a/numcodecs/checksum32.py +++ b/numcodecs/checksum32.py @@ -2,10 +2,12 @@ import numpy as np +import struct from .abc import Codec from .compat import ensure_contiguous_ndarray, ndarray_copy +from .jenkins import jenkins_lookup3 class Checksum32(Codec): @@ -40,3 +42,58 @@ class Adler32(Checksum32): codec_id = 'adler32' checksum = zlib.adler32 + + +class JenkinsLookup3(Checksum32): + """Bob Jenkin's lookup3 checksum with 32-bit output + + This is the HDF5 implementation. 
def encode(self, buf):
    """Return buffer plus 4-byte Bob Jenkin's lookup3 checksum

    Args:
        buf: data to protect. It is passed through
            ``ensure_contiguous_ndarray`` and then flattened and viewed
            as ``uint8`` before hashing.

    Returns:
        bytes: the data bytes followed by the 4-byte checksum. When
        ``self.prefix`` is set, the prefix is hashed ahead of the data
        but is not part of the returned bytes.
    """
    buf = ensure_contiguous_ndarray(buf).ravel().view('uint8')
    if self.prefix is None:
        hashed = buf
    else:
        # The prefix only contributes to the hash input; it is never emitted.
        hashed = np.hstack((self.prefix, buf))
    val = jenkins_lookup3(hashed, self.initval)
    # NOTE(review): this line arrived truncated at `struct.pack("`. The
    # little-endian unsigned 32-bit format ("<I") is restored to match the
    # trailing bytes the codec test expects (0x17770551 -> 51 05 77 17).
    return buf.tobytes() + struct.pack("<I", val)
cdef inline uint32_t _jenkins_lookup3_final(uint32_t a, uint32_t b, uint32_t c):
    """
    _jenkins_lookup3_final -- final mixing of 3 32-bit values (a,b,c) into c

    Pairs of (a,b,c) values differing in only a few bits will usually
    produce values of c that look totally different.  This was tested for
    * pairs that differed by one bit, by two bits, in any combination
      of top bits of (a,b,c), or in any combination of bottom bits of
      (a,b,c).
    * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
      the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
      is commonly produced by subtraction) look like a single 1-bit
      difference.
    * the base values were pseudorandom, all zero but one bit set, or
      all zero plus a counter that starts at zero.

    These constants passed:
     14 11 25 16 4 14 24
     12 14 25 16 4 14 24
    and these came close:
      4  8 15 26 3 22 24
     10  8 15 26 3 22 24
     11  8 15 26 3 22 24

    Args:
        a, b, c: the three 32-bit state words to combine.

    Returns:
        The value of ``c`` after the seven xor / subtract-rotate steps.
    """
    # Each step xors one word with another and then subtracts a rotation of
    # that other word.  The rotation counts used are 14 11 25 16 4 14 24, the
    # first row listed under "These constants passed" above.  All operands are
    # uint32_t, so the subtractions wrap modulo 2**32.  Every statement reads
    # the result of the one before it, so the order must not be changed.
    c ^= b
    c -= _jenkins_lookup3_rot(b,14)
    a ^= c
    a -= _jenkins_lookup3_rot(c,11)
    b ^= a
    b -= _jenkins_lookup3_rot(a,25)
    c ^= b
    c -= _jenkins_lookup3_rot(b,16)
    a ^= c
    a -= _jenkins_lookup3_rot(c,4)
    b ^= a
    b -= _jenkins_lookup3_rot(a,14)
    c ^= b
    c -= _jenkins_lookup3_rot(b,24)
    # Only c is returned; the final values of a and b are discarded.
    return c
def test_jenkins_lookup3():
    """Check jenkins_lookup3 against known hashes and a chain of seeds."""
    phrase = b"Four score and seven years ago"

    # Known answers: (input bytes, initval, expected 32-bit hash).
    known_answers = [
        (b"", 0, 0xdeadbeef),
        (b"", 0xdeadbeef, 0xbd5b7dde),
        (phrase, 0, 0x17770551),
        (phrase, 1, 0xcd628161),
        # jenkins-cffi example
        (b"jenkins", 0, 202276345),
    ]
    for payload, seed, expected in known_answers:
        assert jenkins_lookup3(payload, seed) == expected

    # Feed each hash of the empty input back in as the next initval, starting
    # from 0; over nine rounds no value in the chain may repeat.
    chain = [0]
    for _ in range(9):
        digest = jenkins_lookup3(b"", chain[-1])
        assert digest not in chain
        chain.append(digest)

    # A uint8 ndarray must hash to the same value as the equivalent bytes.
    as_array = np.frombuffer(phrase, dtype="uint8")
    assert jenkins_lookup3(as_array, 0) == 0x17770551
def jenkins_extension():
    """Describe the Cython extension that provides ``numcodecs.jenkins``.

    Returns:
        list: a single-element list holding the ``Extension`` built from
        ``numcodecs/jenkins.pyx``.
    """
    info('setting up jenkins extension')

    jenkins = Extension(
        'numcodecs.jenkins',
        sources=['numcodecs/jenkins.pyx'],
        include_dirs=['numcodecs'],
        define_macros=[('CYTHON_TRACE', '1')],
        # Copy so this extension's flags stay independent of the shared base list.
        extra_compile_args=base_compile_args.copy(),
    )
    return [jenkins]