Skip to content

Commit

Permalink
arm64 decoding supports length in [24,64) #20
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Oct 23, 2024
1 parent 130ba8a commit ddd397f
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 119 deletions.
2 changes: 1 addition & 1 deletion base64_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func encode(enc *Encoding, dst, src []byte) {

func decode(enc *Encoding, dst, src []byte) (int, error) {
srcLen := len(src)
if srcLen >= 64 {
if srcLen >= 24 {
remain := srcLen
if enc.lut == &encodeStdLut {
remain = decodeAsm(dst, src, &dencodeStdLut)
Expand Down
280 changes: 169 additions & 111 deletions base64_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ DATA enc_const<>+0x30(SB)/8, $0x1f1e1b1a17161312 // high part of word
DATA enc_const<>+0x38(SB)/8, $0x0f0e0b0a07060302
GLOBL enc_const<>(SB), (NOPTR+RODATA), $64

// dec_const holds the three 16-byte constants loaded into V1, V2, V3 by the
// 16-byte decode loop to pack four 6-bit values into three output bytes.
//
//   +0x00 dec_reshuffle_const0: per-byte multipliers {0x40, 0x01} repeated.
//         UMULL/UMULL2 followed by a pairwise add yields (b0<<6 | b1) in each
//         16-bit lane.
//   +0x10 dec_reshuffle_const1: per-halfword multipliers {0x1000, 0x0001}
//         repeated. UMULL/UMULL2 followed by a pairwise add yields
//         (h0<<12 | h1) in each 32-bit lane. The low halfword of each
//         little-endian word must be 0x1000 and the high halfword 0x0001;
//         any other pair drops or corrupts the second 12-bit field.
//   +0x20 dec_reshuffle_mask: TBL indices 2,1,0, 6,5,4, 10,9,8, 14,13,12 that
//         emit the three payload bytes of each word most-significant first;
//         the trailing 0xFF indices are out of range for a one-register TBL
//         and therefore produce zero bytes.
DATA dec_const<>+0x00(SB)/8, $0x0140014001400140 // dec_reshuffle_const0
DATA dec_const<>+0x08(SB)/8, $0x0140014001400140
DATA dec_const<>+0x10(SB)/8, $0x0001100000011000 // dec_reshuffle_const1
DATA dec_const<>+0x18(SB)/8, $0x0001100000011000
DATA dec_const<>+0x20(SB)/8, $0x090A040506000102 // dec_reshuffle_mask
DATA dec_const<>+0x28(SB)/8, $0xFFFFFFFF0C0D0E08
GLOBL dec_const<>(SB), (NOPTR+RODATA), $48

//func encodeAsm(dst, src []byte, lut *[64]byte) int
TEXT ·encodeAsm(SB),NOSPLIT,$0
MOVD dst_base+0(FP), R0
Expand All @@ -29,37 +37,37 @@ TEXT ·encodeAsm(SB),NOSPLIT,$0
EOR R5, R5, R5

loop48:
CMP $48, R2
BLT lessThan48

// Move the input bits to where they need to be in the outputs. Except
// for the first output, the high two bits are not cleared.
VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]
VUSHR $2, V0.B16, V3.B16
VUSHR $4, V1.B16, V4.B16
VUSHR $6, V2.B16, V5.B16
VSLI $4, V0.B16, V4.B16
VSLI $2, V1.B16, V5.B16

// Clear the high two bits in the second, third and fourth output.
VAND V7.B16, V4.B16, V4.B16
VAND V7.B16, V5.B16, V5.B16
VAND V7.B16, V2.B16, V6.B16

// The bits have now been shifted to the right locations;
// translate their values 0..63 to the Base64 alphabet.
// Use a 64-byte table lookup:
VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16

// Interleave and store output:
VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)

SUB $48, R2
ADD $64, R5
B loop48
CMP $48, R2
BLT lessThan48

// Move the input bits to where they need to be in the outputs. Except
// for the first output, the high two bits are not cleared.
VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]
VUSHR $2, V0.B16, V3.B16
VUSHR $4, V1.B16, V4.B16
VUSHR $6, V2.B16, V5.B16
VSLI $4, V0.B16, V4.B16
VSLI $2, V1.B16, V5.B16

// Clear the high two bits in the second, third and fourth output.
VAND V7.B16, V4.B16, V4.B16
VAND V7.B16, V5.B16, V5.B16
VAND V7.B16, V2.B16, V6.B16

// The bits have now been shifted to the right locations;
// translate their values 0..63 to the Base64 alphabet.
// Use a 64-byte table lookup:
VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16

// Interleave and store output:
VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)

SUB $48, R2
ADD $64, R5
B loop48

lessThan48:
// fast return
Expand All @@ -73,29 +81,29 @@ lessThan48:
VSHL $2, V7.S4, V12.S4 // mulhi constant

loop12:
VLD1 (R1), [V0.B16]
VTBL V3.B16, [V0.B16], V0.B16 // shuffle bytes
VAND V4.B16, V0.B16, V1.B16 // AND mulhi mask

WORD $0x2e61c182 // UMULL V1.H8, V12.H8, V2.H8
WORD $0x6e61c181 // UMULL2 V1.H8, V12.H8, V1.H8
VTBL V6.B16, [V1.B16, V2.B16], V1.B16

VAND V0.B16, V5.B16, V0.B16
WORD $0x4e609ce0 // VMUL V0.H8, V7.H8, V0.H8
VORR V0.B16, V1.B16, V0.B16

// The bits have now been shifted to the right locations;
// translate their values 0..63 to the Base64 alphabet.
// Use a 64-byte table lookup:
VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
VST1.P [V0.B16], 16(R0)

ADD $12, R1
ADD $16, R5
SUB $12, R2
CMP $16, R2
BGE loop12
VLD1 (R1), [V0.B16]
VTBL V3.B16, [V0.B16], V0.B16 // shuffle bytes
VAND V4.B16, V0.B16, V1.B16 // AND mulhi mask

WORD $0x2e61c182 // UMULL V1.H8, V12.H8, V2.H8
WORD $0x6e61c181 // UMULL2 V1.H8, V12.H8, V1.H8
VTBL V6.B16, [V1.B16, V2.B16], V1.B16

VAND V0.B16, V5.B16, V0.B16
WORD $0x4e609ce0 // VMUL V0.H8, V7.H8, V0.H8
VORR V0.B16, V1.B16, V0.B16

// The bits have now been shifted to the right locations;
// translate their values 0..63 to the Base64 alphabet.
// Use a 64-byte table lookup:
VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
VST1.P [V0.B16], 16(R0)

ADD $12, R1
ADD $16, R5
SUB $12, R2
CMP $16, R2
BGE loop12

done:
MOVD R5, ret+56(FP)
Expand All @@ -113,68 +121,118 @@ TEXT ·decodeAsm(SB),NOSPLIT,$0
MOVD $63, R4
VDUP R4, V7.B16

loop:
CMP $64, R2
BLT done
loop64:
CMP $64, R2
BLT lessThan64

VLD4.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
VLD4.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]

// Get indices for second LUT:
WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
WORD $0x6e272c31 // VUQSUB V7.B16, V1.B16, V17.B16
WORD $0x6e272c52 // VUQSUB V7.B16, V2.B16, V18.B16
WORD $0x6e272c73 // VUQSUB V7.B16, V3.B16, V19.B16

// Get values from first LUT:
VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
VTBL V1.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V21.B16
VTBL V2.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V22.B16
VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V23.B16

// Get values from second LUT:
WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
WORD $0x4e117191 // VTBX V17.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V17.B16
WORD $0x4e127192 // VTBX V18.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V18.B16
WORD $0x4e137193 // VTBX V19.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V19.B16

// Get final values:
VORR V16.B16, V20.B16, V0.B16
VORR V17.B16, V21.B16, V1.B16
VORR V18.B16, V22.B16, V2.B16
VORR V19.B16, V23.B16, V3.B16

// Check for invalid input, any value larger than 63:
WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
WORD $0x6e273431 // VCMHI V7.B16, V1.B16, V17.B16
WORD $0x6e273452 // VCMHI V7.B16, V2.B16, V18.B16
WORD $0x6e273473 // VCMHI V7.B16, V3.B16, V19.B16

VORR V17.B16, V16.B16, V16.B16
VORR V18.B16, V16.B16, V16.B16
VORR V19.B16, V16.B16, V16.B16

// Check that all bits are zero:
WORD $0x6e30aa11 // VUMAXV V16.B16, V17
VMOV V17.B[0], R5
CBNZ R5, done

// Compress four bytes into three:
VSHL $2, V0.B16, V4.B16
VUSHR $4, V1.B16, V16.B16
VORR V16.B16, V4.B16, V4.B16

VSHL $4, V1.B16, V5.B16
VUSHR $2, V2.B16, V16.B16
VORR V16.B16, V5.B16, V5.B16
// Get indices for second LUT
WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
WORD $0x6e272c31 // VUQSUB V7.B16, V1.B16, V17.B16
WORD $0x6e272c52 // VUQSUB V7.B16, V2.B16, V18.B16
WORD $0x6e272c73 // VUQSUB V7.B16, V3.B16, V19.B16

// Get values from first LUT
VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
VTBL V1.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V21.B16
VTBL V2.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V22.B16
VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V23.B16

// Get values from second LUT
WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
WORD $0x4e117191 // VTBX V17.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V17.B16
WORD $0x4e127192 // VTBX V18.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V18.B16
WORD $0x4e137193 // VTBX V19.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V19.B16

// Get final values:
VORR V16.B16, V20.B16, V0.B16
VORR V17.B16, V21.B16, V1.B16
VORR V18.B16, V22.B16, V2.B16
VORR V19.B16, V23.B16, V3.B16

// Check for invalid input, any value larger than 63
WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
WORD $0x6e273431 // VCMHI V7.B16, V1.B16, V17.B16
WORD $0x6e273452 // VCMHI V7.B16, V2.B16, V18.B16
WORD $0x6e273473 // VCMHI V7.B16, V3.B16, V19.B16

VORR V17.B16, V16.B16, V16.B16
VORR V18.B16, V16.B16, V16.B16
VORR V19.B16, V16.B16, V16.B16

// Check that all bits are zero:
WORD $0x6e30aa11 // VUMAXV V16.B16, V17
VMOV V17.B[0], R5
CBNZ R5, done

// Compress four bytes into three
VSHL $2, V0.B16, V4.B16
VUSHR $4, V1.B16, V16.B16
VORR V16.B16, V4.B16, V4.B16

VSHL $4, V1.B16, V5.B16
VUSHR $2, V2.B16, V16.B16
VORR V16.B16, V5.B16, V5.B16

VSHL $6, V2.B16, V16.B16
VORR V16.B16, V3.B16, V6.B16
VSHL $6, V2.B16, V16.B16
VORR V16.B16, V3.B16, V6.B16

// Interleave and store decoded result
VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)

SUB $64, R2
B loop64

lessThan64:
// fast return
CMP $24, R2
BLT done

MOVD $dec_const<>(SB), R4
VLD1 (R4), [V1.B16, V2.B16, V3.B16]

// Interleave and store decoded result:
VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)
loop16:
VLD1.P 16(R1), [V0.B16]

SUB $64, R2
B loop
// Get indices for second LUT
WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16

// Get values from first LUT
VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16

// Get values from second LUT
WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16

// Get final values:
VORR V16.B16, V20.B16, V0.B16

// Check for invalid input, any value larger than 63
WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
// Check that all bits are zero:
WORD $0x6e30aa11 // VUMAXV V16.B16, V17
VMOV V17.B[0], R5
CBNZ R5, done

// Compress four bytes into three
// swap and merge adjacent 6-bit fields
WORD $0x2e20c024 // UMULL V0.B16, V1.B16, V4.H8
WORD $0x6e20c020 // UMULL2 V0.B16, V1.B16, V0.H8
VADDP V0.H8, V4.H8, V0.H8

// swap and merge 12-bit words into a 24-bit word
WORD $0x2e60c044 // UMULL V0.H8, V2.H8, V4.S4
WORD $0x6e60c040 // UMULL2 V0.H8, V2.H8, V0.S4
VADDP V0.S4, V4.S4, V0.S4

// reshuffle bytes
VTBL V3.B16, [V0.B16], V0.B16
VST1 [V0.B16], (R0)

ADD $12, R0
SUB $16, R2
CMP $24, R2
BGE loop16

done:
MOVD R2, ret+56(FP)
Expand Down
20 changes: 13 additions & 7 deletions base64_arm64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,24 @@ func TestStdEncodeSIMD(t *testing.T) {

// TestStdDecodeSIMD calls decodeAsm with the standard-alphabet decode table
// (dencodeStdLut) on each pair's encoded text and checks that dst then holds
// the pair's decoded text.
func TestStdDecodeSIMD(t *testing.T) {
pairs := []testpair{
{"abcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K/fMJwH+Q5e0nr7tK/fMJwH+Q5e0nr7t"},
{"abcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamt="},
{"abcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
}
for _, p := range pairs {
for i, p := range pairs {
expected := []byte(p.decoded)
src := []byte(p.encoded)
// NOTE(review): dst is sized to the expected text only. decodeAsm's 64-byte
// loop stores 48 bytes per iteration and its 16-byte loop stores a whole
// 16-byte vector while advancing dst by 12, so cases whose expected text is
// shorter than what gets stored rely on allocator slack — confirm / pad dst.
dst := make([]byte, len(expected))

// decodeAsm returns its remaining-length counter; a result equal to
// len(src) is treated below as "nothing was decoded".
// NOTE(review): assumes the counter starts at len(src) — confirm in decodeAsm.
ret := decodeAsm(dst, src, &dencodeStdLut)
if ret == len(src) {
t.Fatal("should return decode")
t.Fatalf("case %v should return decode %v, got %v", i, len(src), ret)
}
if !bytes.Equal(dst, expected) {
t.Fatalf("got %x, expected %x", dst, expected)
t.Fatalf("case %v got %x, expected %x", i, dst, expected)
}
}
}
Expand All @@ -58,7 +61,7 @@ func TestUrlEncodeSIMD(t *testing.T) {
pairs := []testpair{
{"!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-"},
{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed\x5a\xcc\x70\x90", "K_fMJwH-Q5e0nr7t"},
{"!?$*&()'-=@~!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"!?$*&()'-=@~!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
}
Expand All @@ -80,21 +83,24 @@ func TestUrlEncodeSIMD(t *testing.T) {

// TestUrlDecodeSIMD calls decodeAsm with the URL-alphabet decode table
// (dencodeUrlLut) on each pair's encoded text and checks that dst then holds
// the pair's decoded text.
func TestUrlDecodeSIMD(t *testing.T) {
pairs := []testpair{
{"!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K_fMJwH-Q5e0nr7tK_fMJwH-Q5e0nr7t"},
{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamt="},
{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
}
for _, p := range pairs {
for i, p := range pairs {
expected := []byte(p.decoded)
src := []byte(p.encoded)
// NOTE(review): dst is sized to the expected text only. decodeAsm's 64-byte
// loop stores 48 bytes per iteration and its 16-byte loop stores a whole
// 16-byte vector while advancing dst by 12, so cases whose expected text is
// shorter than what gets stored rely on allocator slack — confirm / pad dst.
dst := make([]byte, len(expected))

// decodeAsm returns its remaining-length counter; a result equal to
// len(src) is treated below as "nothing was decoded".
// NOTE(review): assumes the counter starts at len(src) — confirm in decodeAsm.
ret := decodeAsm(dst, src, &dencodeUrlLut)
if ret == len(src) {
t.Fatal("should return decode")
t.Fatalf("case %v should return decode %v, got %v", i, len(src), ret)
}
if !bytes.Equal(dst, expected) {
t.Fatalf("got %x, expected %x", dst, expected)
t.Fatalf("case %v got %x, expected %x", i, dst, expected)
}
}
}

0 comments on commit ddd397f

Please sign in to comment.