-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
crbytes: CommonPrefix #1
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
// implied. See the License for the specific language governing | ||
// permissions and limitations under the License. | ||
|
||
package crbytes | ||
|
||
import "encoding/binary" | ||
|
||
// commonPrefixGeneric is used for architectures without a native | ||
// implementation. It is defined here rather than common_generic.go so that the | ||
// benchmarking code can have access to it even when there's a native | ||
// implementation available. | ||
func commonPrefixGeneric(a, b []byte) int { | ||
asUint64 := func(data []byte, i int) uint64 { | ||
return binary.LittleEndian.Uint64(data[i:]) | ||
} | ||
var shared int | ||
n := min(len(a), len(b)) | ||
for shared < n-7 && asUint64(a, shared) == asUint64(b, shared) { | ||
shared += 8 | ||
} | ||
for shared < n && a[shared] == b[shared] { | ||
shared++ | ||
} | ||
return shared | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
// Copyright 2018 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in licenses/BSD-golang.txt. | ||
|
||
// This code is based on compare_amd64.s from Go 1.12.5. | ||
|
||
TEXT ·CommonPrefix(SB),$0-56 | ||
// SI = uintptr(unsafe.Pointer(&a[0])) | ||
MOVQ a_base+0(FP), SI | ||
// BX = len(a) | ||
MOVQ a_len+8(FP), BX | ||
// DI = uintptr(unsafe.Pointer(&b[0])) | ||
MOVQ b_base+24(FP), DI | ||
// DX = len(b) | ||
MOVQ b_len+32(FP), DX | ||
|
||
CMPQ BX, DX | ||
MOVQ DX, R8 | ||
CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare | ||
// Throughout this function, DX remembers the original min(alen, blen) and | ||
// R8 is the number of bytes we still need to compare (with bytes 0 to | ||
// DX-R8 known to match). | ||
MOVQ R8, DX | ||
CMPQ R8, $8 | ||
JB small | ||
|
||
CMPQ R8, $63 | ||
JBE loop | ||
JMP big_loop | ||
RET | ||
|
||
// loop is used when we have between 8 and 63 bytes left to compare (8 <= R8 < 64). | ||
// Invariant: 8 <= R8 < 64 | ||
loop: | ||
CMPQ R8, $16 | ||
JB _0through15 | ||
// X0 = a[:16] | ||
MOVOU (SI), X0 | ||
// X0 = b[:16] | ||
MOVOU (DI), X1 | ||
// Compare Packed Data for Equal: | ||
// for i := 0; i < 16; i++ { | ||
// if X0[i] != X1[i] { | ||
// X1[i] = 0 | ||
// } else { | ||
// X1[i] = 0xFF | ||
// } | ||
// } | ||
PCMPEQB X0, X1 | ||
// Move Byte Mask. | ||
// AX = 0 | ||
// for i := 0; i < 16; i++ { | ||
// if X1[i] & 0x80 != 0 { | ||
// AX |= (1 << i) | ||
// } | ||
PMOVMSKB X1, AX | ||
// AX ^= 0xFFFF | ||
XORQ $0xffff, AX // convert EQ to NE | ||
// if AX != 0 { | ||
// goto diff16 | ||
// } | ||
JNE diff16 // branch if at least one byte is not equal | ||
// a = a[16:] | ||
ADDQ $16, SI | ||
// b = b[16:] | ||
ADDQ $16, DI | ||
// R8 -= 16 | ||
SUBQ $16, R8 | ||
JMP loop | ||
|
||
// Invariant: a[0:48] matches b[0:48] and AX contains a bit mask of differences | ||
// between a[48:64] and b[48:64]. | ||
diff64: | ||
// R8 -= 48 | ||
SUBQ $48, R8 | ||
JMP diff16 | ||
|
||
// Invariant: a[0:32] matches b[0:32] and AX contains a bit mask of differences | ||
// between a[32:48] and b[32:48]. | ||
diff48: | ||
// R8 -= 32 | ||
SUBQ $32, R8 | ||
JMP diff16 | ||
|
||
// Invariant: a[0:16] matches b[0:16] and AX contains a bit mask of differences | ||
// between a[16:32] and b[16:32]. | ||
diff32: | ||
// R8 -= 16 | ||
SUBQ $16, R8 | ||
|
||
// Invariant: AX contains a bit mask of differences between a[:16] and b[:16]. | ||
// AX & (1 << i) == 1 iff a[i] != b[i] | ||
diff16: | ||
// Bit Scan Forward (return the index of the least significant set bit) | ||
// BX = bits.TrailingZeros64(AX) | ||
BSFQ AX, BX | ||
// BX is now the prefix of bytes that matched, advance by this much. | ||
// R8 -= BX | ||
SUBQ BX, R8 | ||
|
||
// Return DX (original min(alen, blen)) - R8 (bytes left to compare) | ||
SUBQ R8, DX | ||
MOVQ DX, ret+48(FP) | ||
RET | ||
|
||
// Invariants: | ||
// - original slices contained at least 8 bytes (DX >= 8) | ||
// - we have at most 15 bytes left to compare (R8 < 16) | ||
_0through15: | ||
// if R8 <= 8 { | ||
// goto _0through8 | ||
// } | ||
CMPQ R8, $8 | ||
JBE _0through8 | ||
// AX = a[:8] | ||
MOVQ (SI), AX | ||
// CX = b[:8] | ||
MOVQ (DI), CX | ||
// if AX != CX { | ||
// goto diff8 | ||
// } | ||
CMPQ AX, CX | ||
JNE diff8 | ||
|
||
// Invariants: | ||
// - original slices contained at least 8 bytes (DX >= 8) | ||
// - we have at most 8 bytes left to compare (R8 <= 8) | ||
// | ||
// Because the backing slices have at least 8 bytes and all the bytes so far | ||
// matched, we can (potentially) back up to where we have exactly 8 bytes to | ||
// compare. | ||
_0through8: | ||
// AX = b[len(b)-8:] | ||
MOVQ -8(SI)(R8*1), AX | ||
// CX = b[len(b)-8:] | ||
MOVQ -8(DI)(R8*1), CX | ||
// if AX == CX { | ||
// goto allsame | ||
// } | ||
CMPQ AX, CX | ||
JEQ allsame | ||
// R8 = 8 | ||
MOVQ $8, R8 | ||
|
||
// Invariant: AX contains a bit mask of differences between a[:8] and b[:8]. | ||
// AX & (1 << i) == 1 iff a[i] != b[i] | ||
diff8: | ||
// CX ^= AX | ||
XORQ AX, CX | ||
// Bit Scan Forward (return the index of the least significant set bit) | ||
// CX = bits.TrailingZeros64(CX) | ||
BSFQ CX, CX | ||
// CX /= 8 | ||
SHRQ $3, CX | ||
// CX is now the 0-based index of the first byte that differs. | ||
// R8 -= CX | ||
SUBQ CX, R8 | ||
|
||
// Return DX (original min(alen, blen)) - R8 (bytes left to compare) | ||
SUBQ R8, DX | ||
MOVQ DX, ret+48(FP) | ||
RET | ||
|
||
// Invariant: original min(alen, blen) < 8. DX < 8, R8 = DX. | ||
small: | ||
// CX = R8 * 8 | ||
LEAQ (R8*8), CX | ||
// CX = -CX | ||
// We only care about the lower 6 bits of CX, so this is equivalent to: | ||
// CX = (8-min(alen, blen)) * 8 | ||
NEGQ CX | ||
JEQ allsame | ||
|
||
// We will load 8 bytes, even though some of them are outside the slice | ||
// bounds. We go out of bounds either before or after the slice depending on | ||
// the value of the pointer. | ||
|
||
// if uintptr(unsafe.Pointer(&a[0]) > 0xF8 { | ||
// goto si_high | ||
// } | ||
CMPB SI, $0xf8 | ||
JA si_high | ||
// SI = a[:8] | ||
MOVQ (SI), SI | ||
// Discard the upper bytes which were out of bounds and add 0s (to be | ||
// removed below). | ||
SHLQ CX, SI | ||
JMP si_finish | ||
si_high: | ||
// SI = a[len(a)-8:] | ||
MOVQ -8(SI)(R8*1), SI | ||
si_finish: | ||
// SI = SI >> CX | ||
// Discard the lower bytes which were added by SHLQ in one case, or that | ||
// were out of bounds in the si_high case. | ||
// In both cases, SI = a[:]. | ||
SHRQ CX, SI | ||
|
||
// if uintptr(unsafe.Pointer(&b[0]) > 0xF8 { | ||
// goto di_high | ||
// } | ||
CMPB DI, $0xf8 | ||
JA di_high | ||
// DI = b[:8] | ||
MOVQ (DI), DI | ||
// Discard the upper bytes which were out of bounds and add 0s (to be | ||
// removed below). | ||
SHLQ CX, DI | ||
JMP di_finish | ||
di_high: | ||
// DI = b[len(b)-8:] | ||
MOVQ -8(DI)(R8*1), DI | ||
di_finish: | ||
// DI = DI >> CX | ||
// Discard the lower bytes which were added by SHLQ in one case, or that | ||
// were out of bounds in the di_high case. | ||
// In both cases, DI = b[:]. | ||
SHRQ CX, DI | ||
|
||
// DI ^= SI | ||
XORQ SI, DI | ||
// if DI == 0 { | ||
// goto allsame | ||
// } | ||
JEQ allsame | ||
|
||
// Bit Scan Forward (return the index of the least significant set bit) | ||
// DI = bits.TrailingZeros64(DI) | ||
BSFQ DI, DI | ||
// DI /= 8 | ||
SHRQ $3, DI | ||
// DI is now the 0-based index of the first byte that differs. | ||
// R8 -= DI | ||
SUBQ DI, R8 | ||
|
||
// Return DX (original min(alen, blen)) - R8 (bytes left to compare) | ||
SUBQ R8, DX | ||
allsame: | ||
MOVQ DX, ret+48(FP) | ||
RET | ||
|
||
// big_loop is used when we have at least 64 bytes to compare. It is similar to | ||
// <loop>, except that we do 4 iterations at a time. | ||
big_loop: | ||
MOVOU (SI), X0 | ||
MOVOU (DI), X1 | ||
PCMPEQB X0, X1 | ||
PMOVMSKB X1, AX | ||
XORQ $0xffff, AX | ||
JNE diff16 | ||
|
||
MOVOU 16(SI), X0 | ||
MOVOU 16(DI), X1 | ||
PCMPEQB X0, X1 | ||
PMOVMSKB X1, AX | ||
XORQ $0xffff, AX | ||
JNE diff32 | ||
|
||
MOVOU 32(SI), X0 | ||
MOVOU 32(DI), X1 | ||
PCMPEQB X0, X1 | ||
PMOVMSKB X1, AX | ||
XORQ $0xffff, AX | ||
JNE diff48 | ||
|
||
MOVOU 48(SI), X0 | ||
MOVOU 48(DI), X1 | ||
PCMPEQB X0, X1 | ||
PMOVMSKB X1, AX | ||
XORQ $0xffff, AX | ||
JNE diff64 | ||
|
||
// a = a[64:] | ||
ADDQ $64, SI | ||
// b = b[64:] | ||
ADDQ $64, DI | ||
// R8 -= 64 | ||
SUBQ $64, R8 | ||
CMPQ R8, $64 | ||
// if R8 < 64 { | ||
// goto loop | ||
// } | ||
JBE loop | ||
JMP big_loop |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If anyone actually wants to look at this, here's the diff against
bytes.Compare
: https://editor.mergely.com/V6Bvcmbr