diff --git a/base64_arm64.go b/base64_arm64.go
index d5664d1..dbdc77b 100644
--- a/base64_arm64.go
+++ b/base64_arm64.go
@@ -41,7 +41,7 @@ func encode(enc *Encoding, dst, src []byte) {
 
 func decode(enc *Encoding, dst, src []byte) (int, error) {
 	srcLen := len(src)
-	if srcLen >= 64 {
+	if srcLen >= 24 {
 		remain := srcLen
 		if enc.lut == &encodeStdLut {
 			remain = decodeAsm(dst, src, &dencodeStdLut)
diff --git a/base64_arm64.s b/base64_arm64.s
index a49a652..d15a6ba 100644
--- a/base64_arm64.s
+++ b/base64_arm64.s
@@ -15,6 +15,14 @@ DATA enc_const<>+0x30(SB)/8, $0x1f1e1b1a17161312 // high part of word
 DATA enc_const<>+0x38(SB)/8, $0x0f0e0b0a07060302
 GLOBL enc_const<>(SB), (NOPTR+RODATA), $64
 
+DATA dec_const<>+0x00(SB)/8, $0x0140014001400140 // dec_reshuffle_const0
+DATA dec_const<>+0x08(SB)/8, $0x0140014001400140
+DATA dec_const<>+0x10(SB)/8, $0x0000110000001100 // dec_reshuffle_const1
+DATA dec_const<>+0x18(SB)/8, $0x0000110000001100
+DATA dec_const<>+0x20(SB)/8, $0x090A040506000102 // dec_reshuffle_mask
+DATA dec_const<>+0x28(SB)/8, $0xFFFFFFFF0C0D0E08
+GLOBL dec_const<>(SB), (NOPTR+RODATA), $48
+
 //func encodeAsm(dst, src []byte, lut *[64]byte) int
 TEXT ·encodeAsm(SB),NOSPLIT,$0
 	MOVD dst_base+0(FP), R0
@@ -29,37 +37,37 @@ TEXT ·encodeAsm(SB),NOSPLIT,$0
 	EOR R5, R5, R5
 
loop48:
-	CMP $48, R2
-	BLT lessThan48
-
-	// Move the input bits to where they need to be in the outputs. Except
-	// for the first output, the high two bits are not cleared.
-	VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]
-	VUSHR $2, V0.B16, V3.B16
-	VUSHR $4, V1.B16, V4.B16
-	VUSHR $6, V2.B16, V5.B16
-	VSLI $4, V0.B16, V4.B16
-	VSLI $2, V1.B16, V5.B16
-
-	// Clear the high two bits in the second, third and fourth output.
-	VAND V7.B16, V4.B16, V4.B16
-	VAND V7.B16, V5.B16, V5.B16
-	VAND V7.B16, V2.B16, V6.B16
-
-	// The bits have now been shifted to the right locations;
-	// translate their values 0..63 to the Base64 alphabet.
-	// Use a 64-byte table lookup:
-	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
-	VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
-	VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
-	VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16
-
-	// Interleave and store output:
-	VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)
-
-	SUB $48, R2
-	ADD $64, R5
-	B loop48
+	CMP $48, R2
+	BLT lessThan48
+
+	// Move the input bits to where they need to be in the outputs. Except
+	// for the first output, the high two bits are not cleared.
+	VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]
+	VUSHR $2, V0.B16, V3.B16
+	VUSHR $4, V1.B16, V4.B16
+	VUSHR $6, V2.B16, V5.B16
+	VSLI $4, V0.B16, V4.B16
+	VSLI $2, V1.B16, V5.B16
+
+	// Clear the high two bits in the second, third and fourth output.
+	VAND V7.B16, V4.B16, V4.B16
+	VAND V7.B16, V5.B16, V5.B16
+	VAND V7.B16, V2.B16, V6.B16
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
+	VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
+	VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
+	VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16
+
+	// Interleave and store output:
+	VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)
+
+	SUB $48, R2
+	ADD $64, R5
+	B loop48
 
 lessThan48:
 	// fast return
@@ -73,29 +81,29 @@ lessThan48:
 	VSHL $2, V7.S4, V12.S4 // mulhi constant
 
 loop12:
-	VLD1 (R1), [V0.B16]
-	VTBL V3.B16, [V0.B16], V0.B16 // shuffle bytes
-	VAND V4.B16, V0.B16, V1.B16   // AND mulhi mask
-
-	WORD $0x2e61c182 // UMULL V1.H8, V12.H8, V2.H8
-	WORD $0x6e61c181 // UMULL2 V1.H8, V12.H8, V1.H8
-	VTBL V6.B16, [V1.B16, V2.B16], V1.B16
-
-	VAND V0.B16, V5.B16, V0.B16
-	WORD $0x4e609ce0 // VMUL V0.H8, V7.H8, V0.H8
-	VORR V0.B16, V1.B16, V0.B16
-
-	// The bits have now been shifted to the right locations;
-	// translate their values 0..63 to the Base64 alphabet.
-	// Use a 64-byte table lookup:
-	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
-	VST1.P [V0.B16], 16(R0)
-
-	ADD $12, R1
-	ADD $16, R5
-	SUB $12, R2
-	CMP $16, R2
-	BGE loop12
+	VLD1 (R1), [V0.B16]
+	VTBL V3.B16, [V0.B16], V0.B16 // shuffle bytes
+	VAND V4.B16, V0.B16, V1.B16   // AND mulhi mask
+
+	WORD $0x2e61c182 // UMULL V1.H8, V12.H8, V2.H8
+	WORD $0x6e61c181 // UMULL2 V1.H8, V12.H8, V1.H8
+	VTBL V6.B16, [V1.B16, V2.B16], V1.B16
+
+	VAND V0.B16, V5.B16, V0.B16
+	WORD $0x4e609ce0 // VMUL V0.H8, V7.H8, V0.H8
+	VORR V0.B16, V1.B16, V0.B16
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V0.B16
+	VST1.P [V0.B16], 16(R0)
+
+	ADD $12, R1
+	ADD $16, R5
+	SUB $12, R2
+	CMP $16, R2
+	BGE loop12
 
 done:
 	MOVD R5, ret+56(FP)
@@ -113,68 +121,118 @@ TEXT ·decodeAsm(SB),NOSPLIT,$0
 	MOVD $63, R4
 	VDUP R4, V7.B16
 
-loop:
-	CMP $64, R2
-	BLT done
+loop64:
+	CMP $64, R2
+	BLT lessThan64
 
-	VLD4.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VLD4.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
 
-	// Get indices for second LUT:
-	WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
-	WORD $0x6e272c31 // VUQSUB V7.B16, V1.B16, V17.B16
-	WORD $0x6e272c52 // VUQSUB V7.B16, V2.B16, V18.B16
-	WORD $0x6e272c73 // VUQSUB V7.B16, V3.B16, V19.B16
-
-	// Get values from first LUT:
-	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
-	VTBL V1.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V21.B16
-	VTBL V2.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V22.B16
-	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V23.B16
-
-	// Get values from second LUT:
-	WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
-	WORD $0x4e117191 // VTBX V17.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V17.B16
-	WORD $0x4e127192 // VTBX V18.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V18.B16
-	WORD $0x4e137193 // VTBX V19.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V19.B16
-
-	// Get final values:
-	VORR V16.B16, V20.B16, V0.B16
-	VORR V17.B16, V21.B16, V1.B16
-	VORR V18.B16, V22.B16, V2.B16
-	VORR V19.B16, V23.B16, V3.B16
-
-	// Check for invalid input, any value larger than 63:
-	WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
-	WORD $0x6e273431 // VCMHI V7.B16, V1.B16, V17.B16
-	WORD $0x6e273452 // VCMHI V7.B16, V2.B16, V18.B16
-	WORD $0x6e273473 // VCMHI V7.B16, V3.B16, V19.B16
-
-	VORR V17.B16, V16.B16, V16.B16
-	VORR V18.B16, V16.B16, V16.B16
-	VORR V19.B16, V16.B16, V16.B16
-
-	// Check that all bits are zero:
-	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
-	VMOV V17.B[0], R5
-	CBNZ R5, done
-
-	// Compress four bytes into three:
-	VSHL $2, V0.B16, V4.B16
-	VUSHR $4, V1.B16, V16.B16
-	VORR V16.B16, V4.B16, V4.B16
-
-	VSHL $4, V1.B16, V5.B16
-	VUSHR $2, V2.B16, V16.B16
-	VORR V16.B16, V5.B16, V5.B16
+	// Get indices for second LUT
+	WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
+	WORD $0x6e272c31 // VUQSUB V7.B16, V1.B16, V17.B16
+	WORD $0x6e272c52 // VUQSUB V7.B16, V2.B16, V18.B16
+	WORD $0x6e272c73 // VUQSUB V7.B16, V3.B16, V19.B16
+
+	// Get values from first LUT
+	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
+	VTBL V1.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V21.B16
+	VTBL V2.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V22.B16
+	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V23.B16
+
+	// Get values from second LUT
+	WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
+	WORD $0x4e117191 // VTBX V17.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V17.B16
+	WORD $0x4e127192 // VTBX V18.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V18.B16
+	WORD $0x4e137193 // VTBX V19.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V19.B16
+
+	// Get final values:
+	VORR V16.B16, V20.B16, V0.B16
+	VORR V17.B16, V21.B16, V1.B16
+	VORR V18.B16, V22.B16, V2.B16
+	VORR V19.B16, V23.B16, V3.B16
+
+	// Check for invalid input, any value larger than 63
+	WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
+	WORD $0x6e273431 // VCMHI V7.B16, V1.B16, V17.B16
+	WORD $0x6e273452 // VCMHI V7.B16, V2.B16, V18.B16
+	WORD $0x6e273473 // VCMHI V7.B16, V3.B16, V19.B16
+
+	VORR V17.B16, V16.B16, V16.B16
+	VORR V18.B16, V16.B16, V16.B16
+	VORR V19.B16, V16.B16, V16.B16
+
+	// Check that all bits are zero:
+	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
+	VMOV V17.B[0], R5
+	CBNZ R5, done
+
+	// Compress four bytes into three
+	VSHL $2, V0.B16, V4.B16
+	VUSHR $4, V1.B16, V16.B16
+	VORR V16.B16, V4.B16, V4.B16
+
+	VSHL $4, V1.B16, V5.B16
+	VUSHR $2, V2.B16, V16.B16
+	VORR V16.B16, V5.B16, V5.B16
 
-	VSHL $6, V2.B16, V16.B16
-	VORR V16.B16, V3.B16, V6.B16
+	VSHL $6, V2.B16, V16.B16
+	VORR V16.B16, V3.B16, V6.B16
+
+	// Interleave and store decoded result
+	VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)
+
+	SUB $64, R2
+	B loop64
+
+lessThan64:
+	// fast return
+	CMP $24, R2
+	BLT done
+
+	MOVD $dec_const<>(SB), R4
+	VLD1 (R4), [V1.B16, V2.B16, V3.B16]
 
-	// Interleave and store decoded result:
-	VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)
+loop16:
+	VLD1.P 16(R1), [V0.B16]
 
-	SUB $64, R2
-	B loop
+	// Get indices for second LUT
+	WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
+
+	// Get values from first LUT
+	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
+
+	// Get values from second LUT
+	WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
+
+	// Get final values:
+	VORR V16.B16, V20.B16, V0.B16
+
+	// Check for invalid input, any value larger than 63
+	WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
+	// Check that all bits are zero:
+	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
+	VMOV V17.B[0], R5
+	CBNZ R5, done
+
+	// Compress four bytes into three
+	// swap and merge adjacent 6-bit fields
+	WORD $0x2e20c024 // UMULL V0.B16, V1.B16, V4.H8
+	WORD $0x6e20c020 // UMULL2 V0.B16, V1.B16, V0.H8
+	VADDP V0.H8, V4.H8, V0.H8
+
+	// swap and merge 12-bit words into a 24-bit word
+	WORD $0x2e60c044 // UMULL V0.H8, V2.H8, V4.S4
+	WORD $0x6e60c040 // UMULL2 V0.H8, V2.H8, V0.S4
+	VADDP V0.S4, V4.S4, V0.S4
+
+	// reshuffle bytes
+	VTBL V3.B16, [V0.B16], V0.B16
+	VST1 [V0.B16], (R0)
+
+	ADD $12, R0
+	SUB $16, R2
+	CMP $24, R2
+	BGE loop16
 
 done:
 	MOVD R2, ret+56(FP)
diff --git a/base64_arm64_test.go b/base64_arm64_test.go
index 2ab87de..ffc79e8 100644
--- a/base64_arm64_test.go
+++ b/base64_arm64_test.go
@@ -35,21 +35,24 @@ func TestStdEncodeSIMD(t *testing.T) {
 
 func TestStdDecodeSIMD(t *testing.T) {
 	pairs := []testpair{
+		{"abcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
+		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K/fMJwH+Q5e0nr7tK/fMJwH+Q5e0nr7t"},
+		{"abcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
 		{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
 		{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamt="},
 		{"abcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
 	}
 
-	for _, p := range pairs {
+	for i, p := range pairs {
 		expected := []byte(p.decoded)
 		src := []byte(p.encoded)
 		dst := make([]byte, len(expected))
 		ret := decodeAsm(dst, src, &dencodeStdLut)
 		if ret == len(src) {
-			t.Fatal("should return decode")
+			t.Fatalf("case %v: decodeAsm should consume input, remaining %v of %v", i, ret, len(src))
 		}
 		if !bytes.Equal(dst, expected) {
-			t.Fatalf("got %x, expected %x", dst, expected)
+			t.Fatalf("case %v: got %x, expected %x", i, dst, expected)
 		}
 	}
 }
@@ -58,7 +61,7 @@ func TestUrlEncodeSIMD(t *testing.T) {
 	pairs := []testpair{
 		{"!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-"},
 		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed\x5a\xcc\x70\x90", "K_fMJwH-Q5e0nr7t"},
-		{"!?$*&()'-=@~!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
+		{"!?$*&()'-=@~!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 	}
@@ -80,21 +83,24 @@ func TestUrlEncodeSIMD(t *testing.T) {
 
 func TestUrlDecodeSIMD(t *testing.T) {
 	pairs := []testpair{
+		{"!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
+		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K_fMJwH-Q5e0nr7tK_fMJwH-Q5e0nr7t"},
+		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 		{"abcdefghijklabcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamt="},
 		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
 	}
 
-	for _, p := range pairs {
+	for i, p := range pairs {
 		expected := []byte(p.decoded)
 		src := []byte(p.encoded)
 		dst := make([]byte, len(expected))
 		ret := decodeAsm(dst, src, &dencodeUrlLut)
 		if ret == len(src) {
-			t.Fatal("should return decode")
+			t.Fatalf("case %v: decodeAsm should consume input, remaining %v of %v", i, ret, len(src))
 		}
 		if !bytes.Equal(dst, expected) {
-			t.Fatalf("got %x, expected %x", dst, expected)
+			t.Fatalf("case %v: got %x, expected %x", i, dst, expected)
 		}
 	}
 }
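
Note (not part of the patch): the "Compress four bytes into three" steps above, both the VSHL/VUSHR/VORR sequence in loop64 and the UMULL/VADDP plus VTBL reshuffle in the new loop16 tail, are the standard Base64 repacking of four 6-bit values into three output bytes. Below is a minimal scalar Go sketch of that repacking for illustration only; packQuantum is a hypothetical helper name, not a symbol in this repository.

	package main

	import "fmt"

	// packQuantum packs four 6-bit values a, b, c, d (each 0..63) into the
	// three decoded bytes, mirroring the NEON shift/OR sequence in loop64.
	func packQuantum(a, b, c, d byte) (byte, byte, byte) {
		b0 := a<<2 | b>>4 // all 6 bits of a, high 2 bits of b
		b1 := b<<4 | c>>2 // low 4 bits of b, high 4 bits of c
		b2 := c<<6 | d    // low 2 bits of c, all 6 bits of d
		return b0, b1, b2
	}

	func main() {
		// "TWFu" maps to the 6-bit values 19, 22, 5 and 46 in the standard
		// alphabet; repacking them yields the bytes of "Man".
		b0, b1, b2 := packQuantum(19, 22, 5, 46)
		fmt.Printf("%c%c%c\n", b0, b1, b2) // Man
	}

The new srcLen >= 24 threshold in decode (and the CMP $24 guard before loop16) appears to give the tail loop the headroom it needs: each iteration stores a full 16-byte vector while only advancing dst by 12 bytes, and with at least 24 encoded bytes left there are at least 18 decoded bytes still to be written, so the wide store stays inside a dst sized for the decoded output.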