From bf992328364a64ca7ec1cfb4618099268d2d6af2 Mon Sep 17 00:00:00 2001 From: Potuz Date: Wed, 10 May 2023 10:14:38 -0300 Subject: [PATCH 1/2] Slices without reflect This PR changes the assembly code to receive pointers to the first bytes of the slices instead of the slices themselves. This allows the Go binding to simply pass pointers to the first byte and be able to expose functions that take either `[][32]byte` or `[]byte`. The generic fallback does copy chunks and digests in the case of byte slice arguments. --- hash.go | 27 ++++++++++++--------------- hash_amd64.s | 20 ++++++++++---------- hash_arm64.s | 4 ++-- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/hash.go b/hash.go index 9223c27..f350440 100644 --- a/hash.go +++ b/hash.go @@ -25,11 +25,9 @@ package gohashtree import ( "fmt" - "reflect" - "unsafe" ) -func _hash(digests *byte, p [][32]byte, count uint32) +func _hash(digests *byte, p *byte, count uint32) func Hash(digests [][32]byte, chunks [][32]byte) error { if len(chunks) == 0 { @@ -43,7 +41,7 @@ func Hash(digests [][32]byte, chunks [][32]byte) error { return fmt.Errorf("not enough digest length, need at least %v, got %v", len(chunks)/2, len(digests)) } if supportedCPU { - _hash(&digests[0][0], chunks, uint32(len(chunks)/2)) + _hash(&digests[0][0], &chunks[0][0], uint32(len(chunks)/2)) } else { sha256_1_generic(digests, chunks) } @@ -51,7 +49,7 @@ func Hash(digests [][32]byte, chunks [][32]byte) error { } func HashChunks(digests [][32]byte, chunks [][32]byte) { - _hash(&digests[0][0], chunks, uint32(len(chunks)/2)) + _hash(&digests[0][0], &chunks[0][0], uint32(len(chunks)/2)) } func HashByteSlice(digests []byte, chunks []byte) error { @@ -69,18 +67,17 @@ func HashByteSlice(digests []byte, chunks []byte) error { } // We use an unsafe pointer to cast []byte to [][32]byte. The length and // capacity of the slice need to be divided accordingly by 32. - header := *(*reflect.SliceHeader)(unsafe.Pointer(&chunks)) - header.Len <<= 5 - header.Cap <<= 5 - chunkedChunks := *(*[][32]byte)(unsafe.Pointer(&header)) - if supportedCPU { - _hash(&digests[0], chunkedChunks, uint32(len(chunks)/64)) + _hash(&digests[0], &chunks[0], uint32(len(chunks)/64)) } else { - headerDigest := *(*reflect.SliceHeader)(unsafe.Pointer(&digests)) - headerDigest.Len <<= 5 - headerDigest.Cap <<= 5 - chunkedDigest := *(*[][32]byte)(unsafe.Pointer(&headerDigest)) + chunkedChunks := make([][32]byte, len(chunks)/32) + for i := range chunkedChunks { + copy(chunkedChunks[i][:], chunks[32*i:32*i+32]) + } + chunkedDigest := make([][32]byte, len(digests)/32) + for i := range chunkedDigest { + copy(chunkedDigest[i][:], digests[32*i:32*i+32]) + } sha256_1_generic(chunkedDigest, chunkedChunks) } return nil diff --git a/hash_amd64.s b/hash_amd64.s index a107304..dce336e 100644 --- a/hash_amd64.s +++ b/hash_amd64.s @@ -783,8 +783,8 @@ TEXT ·_hash(SB), 0, $928-36 JE avx2 MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte - MOVQ p_base+8(FP), DATA_PTR // p [][32]byte - MOVL count+32(FP), NUM_BLKS // NUM_BLKS uint32 + MOVQ p+8(FP), DATA_PTR // p *[][32]byte or *[]byte + MOVL count+16(FP), NUM_BLKS // NUM_BLKS uint32 avx1: CMPL NUM_BLKS, $4 @@ -1314,9 +1314,9 @@ sha256_1_avx_epilog: // 8 blocks at a time with AVX2 avx2: - MOVL count+32(FP), NUM_BLKS // NUMBLKS uint32 - MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte - MOVQ p_base+8(FP), DATA_PTR // p [][32]byte + MOVL count+16(FP), NUM_BLKS // NUMBLKS uint32 + MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte or *[]byte + MOVQ p+8(FP), DATA_PTR // p *[][32]byte or p *[]byte sha256_8_avx2_loop: CMPL NUM_BLKS, $8 @@ -1591,8 +1591,8 @@ sha256_8_avx2_loop: // AVX 512 section avx512: MOVQ digests+0(FP), OUTPUT_PTR - MOVQ p_base+8(FP), DATA_PTR - MOVL count+32(FP), NUM_BLKS + MOVQ p+8(FP), DATA_PTR + MOVL count+16(FP), NUM_BLKS MOVQ $_DIGEST_16<>(SB), DIGESTAVX512 MOVQ $_PADDING_16<>(SB), PADDINGAVX512 @@ -2046,9 +2046,9 @@ avx512_loop: // SHA-ni section shani: - MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte - MOVQ p_base+8(FP), DATA_PTR // p [][32]byte - MOVL count+32(FP), NUM_BLKS // NUM_BLKS uint32 + MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte or *[]byte + MOVQ p+8(FP), DATA_PTR // p *[][32]byte or *[]byte + MOVL count+16(FP), NUM_BLKS // NUM_BLKS uint32 // Golang assembly does not guarantee stack aligned at 16 bytes MOVQ SP, SAVE_SP diff --git a/hash_arm64.s b/hash_arm64.s index dc5c7c5..36dba36 100644 --- a/hash_arm64.s +++ b/hash_arm64.s @@ -461,8 +461,8 @@ Copied parts are TEXT ·_hash(SB), 0, $1024-36 MOVD digests+0(FP), OUTPUT_PTR - MOVD p_base+8(FP), DATA_PTR - MOVWU count+32(FP), NUM_BLKS + MOVD p+8(FP), DATA_PTR + MOVWU count+16(FP), NUM_BLKS MOVBU ·hasShani(SB), check_shani CBNZ check_shani, shani From 41f42fe2b56a7ea665c49b575092b9dc3ed20ef0 Mon Sep 17 00:00:00 2001 From: Potuz Date: Tue, 16 May 2023 15:40:02 -0300 Subject: [PATCH 2/2] add benches --- hash_test.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/hash_test.go b/hash_test.go index 53dbb69..8d2e814 100644 --- a/hash_test.go +++ b/hash_test.go @@ -316,6 +316,15 @@ func BenchmarkHash_1(b *testing.B) { } } +func BenchmarkHash_slice_1(b *testing.B) { + chunks := make([]byte, 64) + digests := make([]byte, 32) + b.ResetTimer() + for i := 0; i < b.N; i++ { + gohashtree.HashByteSlice(digests, chunks) + } +} + func BenchmarkHash_4_minio(b *testing.B) { chunks := [64 * 4]byte{'A'} digests := make([][32]byte, 4) @@ -336,6 +345,15 @@ func BenchmarkHash_4(b *testing.B) { } } +func BenchmarkHash_slice_4(b *testing.B) { + chunks := make([]byte, 8*32) + digests := make([]byte, 4*32) + b.ResetTimer() + for i := 0; i < b.N; i++ { + gohashtree.HashByteSlice(digests, chunks) + } +} + func BenchmarkHash_8_minio(b *testing.B) { chunks := [64 * 8]byte{'A'} digests := make([][32]byte, 8) @@ -356,6 +374,15 @@ func BenchmarkHash_8(b *testing.B) { } } +func BenchmarkHash_slice_8(b *testing.B) { + chunks := make([]byte, 16*32) + digests := make([]byte, 8*32) + b.ResetTimer() + for i := 0; i < b.N; i++ { + gohashtree.HashByteSlice(digests, chunks) + } +} + func BenchmarkHash_16_minio(b *testing.B) { chunks := [64 * 16]byte{'A'} digests := make([][32]byte, 16) @@ -376,6 +403,15 @@ func BenchmarkHash_16(b *testing.B) { } } +func BenchmarkHash_slice_16(b *testing.B) { + chunks := make([]byte, 32*32) + digests := make([]byte, 16*32) + b.ResetTimer() + for i := 0; i < b.N; i++ { + gohashtree.HashByteSlice(digests, chunks) + } +} + func BenchmarkHashLargeList_minio(b *testing.B) { balances := make([][32]byte, 400000) for i := 0; i < len(balances); i++ { @@ -402,3 +438,15 @@ func BenchmarkHashList(b *testing.B) { gohashtree.Hash(digests, balances) } } + +func BenchmarkHashList_slice(b *testing.B) { + balances := make([]byte, 400000*32) + for i := 0; i < len(balances); i += 32 { + balances[i] = byte('A') + } + digests := make([]byte, 200000*32) + b.ResetTimer() + for i := 0; i < b.N; i++ { + gohashtree.HashByteSlice(digests, balances) + } +}