From 5f40f9a1400558e1c37593698646c256982a894a Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 3 Sep 2024 11:20:08 +1000 Subject: [PATCH 1/2] Thumb-2 ChaCha: implementation in assembly Implementation of ChaCha algorithm for ARM Thumb-2. --- src/include.am | 8 + wolfcrypt/src/chacha.c | 3 +- wolfcrypt/src/port/arm/thumb2-chacha-asm.S | 575 +++++++++++++++ wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c | 731 +++++++++++++++++++ wolfcrypt/src/port/arm/thumb2-chacha.c | 187 +++++ 5 files changed, 1503 insertions(+), 1 deletion(-) create mode 100644 wolfcrypt/src/port/arm/thumb2-chacha-asm.S create mode 100644 wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c create mode 100644 wolfcrypt/src/port/arm/thumb2-chacha.c diff --git a/src/include.am b/src/include.am index 36607f5ce9..61f89f86d4 100644 --- a/src/include.am +++ b/src/include.am @@ -996,6 +996,14 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c else +if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm.S +endif !BUILD_ARMASM_INLINE +endif BUILD_ARMASM if BUILD_RISCV_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c endif BUILD_RISCV_ASM diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index 1bad41dfb3..f7ee6bba38 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -72,7 +72,8 @@ Public domain. 
#endif /* HAVE_CHACHA */ -#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) +#if defined(WOLFSSL_ARMASM) && (!defined(WOLFSSL_ARMASM_NO_NEON) || \ + defined(__thumb__)) /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ #elif defined(WOLFSSL_RISCV_ASM) diff --git a/wolfcrypt/src/port/arm/thumb2-chacha-asm.S b/wolfcrypt/src/port/arm/thumb2-chacha-asm.S new file mode 100644 index 0000000000..4c3c2e7e77 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-chacha-asm.S @@ -0,0 +1,575 @@ +/* thumb2-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-chacha-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef HAVE_CHACHA + .text + .align 4 + .globl wc_chacha_setiv + .type wc_chacha_setiv, %function +wc_chacha_setiv: + PUSH {r4, r5, r6, lr} + ADD r3, r0, #0x34 + LDR r4, [r1] + LDR r5, [r1, #4] + LDR r6, [r1, #8] + STR r2, [r0, #48] +#ifdef BIG_ENDIAN_ORDER + REV r4, r4 + REV r5, r5 + REV r6, r6 +#endif /* BIG_ENDIAN_ORDER */ + STM r3, {r4, r5, r6} + POP {r4, r5, r6, pc} + /* Cycle Count = 26 */ + .size wc_chacha_setiv,.-wc_chacha_setiv + .text + .type L_chacha_thumb2_constants, %object + .size L_chacha_thumb2_constants, 32 + .align 4 +L_chacha_thumb2_constants: + .word 0x61707865 + .word 0x3120646e + .word 0x79622d36 + .word 0x6b206574 + .word 0x61707865 + .word 0x3320646e + .word 0x79622d32 + .word 0x6b206574 + .text + .align 4 + .globl wc_chacha_setkey + .type wc_chacha_setkey, %function +wc_chacha_setkey: + PUSH {r4, r5, r6, r7, lr} + ADR r7, L_chacha_thumb2_constants + SUBS r2, r2, #0x10 + ADD r7, r7, r2 + /* Start state with constants */ + LDM r7, {r3, r4, r5, r6} + STM r0!, {r3, r4, r5, r6} + /* Next is first 16 bytes of key. */ + LDR r3, [r1] + LDR r4, [r1, #4] + LDR r5, [r1, #8] + LDR r6, [r1, #12] +#ifdef BIG_ENDIAN_ORDER + REV r3, r3 + REV r4, r4 + REV r5, r5 + REV r6, r6 +#endif /* BIG_ENDIAN_ORDER */ + STM r0!, {r3, r4, r5, r6} + /* Next 16 bytes of key. 
*/ +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_setkey_same_keyb_ytes +#else + BEQ.N L_chacha_thumb2_setkey_same_keyb_ytes +#endif + /* Update key pointer for next 16 bytes. */ + ADD r1, r1, r2 + LDR r3, [r1] + LDR r4, [r1, #4] + LDR r5, [r1, #8] + LDR r6, [r1, #12] +L_chacha_thumb2_setkey_same_keyb_ytes: + STM r0, {r3, r4, r5, r6} + POP {r4, r5, r6, r7, pc} + /* Cycle Count = 60 */ + .size wc_chacha_setkey,.-wc_chacha_setkey + .text + .align 4 + .globl wc_chacha_crypt_bytes + .type wc_chacha_crypt_bytes, %function +wc_chacha_crypt_bytes: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x34 + MOV lr, r0 + STRD r0, r1, [sp, #32] + STRD r2, r3, [sp, #40] +L_chacha_thumb2_crypt_block: + /* Put x[12]..x[15] onto stack. */ + LDRD r4, r5, [lr, #48] + LDRD r6, r7, [lr, #56] + STRD r4, r5, [sp, #16] + STRD r6, r7, [sp, #24] + /* Load x[0]..x[12] into registers. */ + LDM lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12} + /* 10x 2 full rounds to perform. 
*/ + MOV lr, #0xa + STR lr, [sp, #48] +L_chacha_thumb2_crypt_loop: + /* 0, 4, 8, 12 */ + /* 1, 5, 9, 13 */ + LDR lr, [sp, #20] + ADD r0, r0, r4 + ADD r1, r1, r5 + EOR r12, r12, r0 + EOR lr, lr, r1 + ROR r12, r12, #16 + ROR lr, lr, #16 + ADD r8, r8, r12 + ADD r9, r9, lr + EOR r4, r4, r8 + EOR r5, r5, r9 + ROR r4, r4, #20 + ROR r5, r5, #20 + ADD r0, r0, r4 + ADD r1, r1, r5 + EOR r12, r12, r0 + EOR lr, lr, r1 + ROR r12, r12, #24 + ROR lr, lr, #24 + ADD r8, r8, r12 + ADD r9, r9, lr + EOR r4, r4, r8 + EOR r5, r5, r9 + ROR r4, r4, #25 + ROR r5, r5, #25 + STR r12, [sp, #16] + STR lr, [sp, #20] + /* 2, 6, 10, 14 */ + /* 3, 7, 11, 15 */ + LDR r12, [sp, #24] + LDR lr, [sp, #28] + ADD r2, r2, r6 + ADD r3, r3, r7 + EOR r12, r12, r2 + EOR lr, lr, r3 + ROR r12, r12, #16 + ROR lr, lr, #16 + ADD r10, r10, r12 + ADD r11, r11, lr + EOR r6, r6, r10 + EOR r7, r7, r11 + ROR r6, r6, #20 + ROR r7, r7, #20 + ADD r2, r2, r6 + ADD r3, r3, r7 + EOR r12, r12, r2 + EOR lr, lr, r3 + ROR r12, r12, #24 + ROR lr, lr, #24 + ADD r10, r10, r12 + ADD r11, r11, lr + EOR r6, r6, r10 + EOR r7, r7, r11 + ROR r6, r6, #25 + ROR r7, r7, #25 + /* 3, 4, 9, 14 */ + /* 0, 5, 10, 15 */ + ADD r3, r3, r4 + ADD r0, r0, r5 + EOR r12, r12, r3 + EOR lr, lr, r0 + ROR r12, r12, #16 + ROR lr, lr, #16 + ADD r9, r9, r12 + ADD r10, r10, lr + EOR r4, r4, r9 + EOR r5, r5, r10 + ROR r4, r4, #20 + ROR r5, r5, #20 + ADD r3, r3, r4 + ADD r0, r0, r5 + EOR r12, r12, r3 + EOR lr, lr, r0 + ROR r12, r12, #24 + ROR lr, lr, #24 + ADD r9, r9, r12 + ADD r10, r10, lr + EOR r4, r4, r9 + EOR r5, r5, r10 + ROR r4, r4, #25 + ROR r5, r5, #25 + STR r12, [sp, #24] + STR lr, [sp, #28] + LDR r12, [sp, #16] + LDR lr, [sp, #20] + /* 1, 6, 11, 12 */ + /* 2, 7, 8, 13 */ + ADD r1, r1, r6 + ADD r2, r2, r7 + EOR r12, r12, r1 + EOR lr, lr, r2 + ROR r12, r12, #16 + ROR lr, lr, #16 + ADD r11, r11, r12 + ADD r8, r8, lr + EOR r6, r6, r11 + EOR r7, r7, r8 + ROR r6, r6, #20 + ROR r7, r7, #20 + ADD r1, r1, r6 + ADD r2, r2, r7 + EOR r12, r12, r1 + EOR lr, lr, r2 + 
ROR r12, r12, #24 + ROR lr, lr, #24 + ADD r11, r11, r12 + ADD r8, r8, lr + EOR r6, r6, r11 + EOR r7, r7, r8 + ROR r6, r6, #25 + ROR r7, r7, #25 + STR lr, [sp, #20] + /* Check if we have done enough rounds. */ + LDR lr, [sp, #48] + SUBS lr, lr, #0x1 + STR lr, [sp, #48] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGT L_chacha_thumb2_crypt_loop +#else + BGT.N L_chacha_thumb2_crypt_loop +#endif + STM sp, {r8, r9, r10, r11, r12} + LDR lr, [sp, #32] + MOV r12, sp + /* Add in original state */ + LDM lr!, {r8, r9, r10, r11} + ADD r0, r0, r8 + ADD r1, r1, r9 + ADD r2, r2, r10 + ADD r3, r3, r11 + LDM lr!, {r8, r9, r10, r11} + ADD r4, r4, r8 + ADD r5, r5, r9 + ADD r6, r6, r10 + ADD r7, r7, r11 + LDM r12, {r8, r9} + LDM lr!, {r10, r11} + ADD r8, r8, r10 + ADD r9, r9, r11 + STM r12!, {r8, r9} + LDM r12, {r8, r9} + LDM lr!, {r10, r11} + ADD r8, r8, r10 + ADD r9, r9, r11 + STM r12!, {r8, r9} + LDM r12, {r8, r9} + LDM lr!, {r10, r11} + ADD r8, r8, r10 + ADD r9, r9, r11 + ADD r10, r10, #0x1 + STM r12!, {r8, r9} + STR r10, [lr, #-8] + LDM r12, {r8, r9} + LDM lr, {r10, r11} + ADD r8, r8, r10 + ADD r9, r9, r11 + STM r12, {r8, r9} + LDR r12, [sp, #44] + CMP r12, #0x40 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_chacha_thumb2_crypt_lt_block +#else + BLT.N L_chacha_thumb2_crypt_lt_block +#endif + LDR r12, [sp, #40] + LDR lr, [sp, #36] + /* XOR state into 64 bytes. 
*/ + LDR r8, [r12] + LDR r9, [r12, #4] + LDR r10, [r12, #8] + LDR r11, [r12, #12] + EOR r0, r0, r8 + EOR r1, r1, r9 + EOR r2, r2, r10 + EOR r3, r3, r11 + STR r0, [lr] + STR r1, [lr, #4] + STR r2, [lr, #8] + STR r3, [lr, #12] + LDR r8, [r12, #16] + LDR r9, [r12, #20] + LDR r10, [r12, #24] + LDR r11, [r12, #28] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [lr, #16] + STR r5, [lr, #20] + STR r6, [lr, #24] + STR r7, [lr, #28] + LDR r4, [sp] + LDR r5, [sp, #4] + LDR r6, [sp, #8] + LDR r7, [sp, #12] + LDR r8, [r12, #32] + LDR r9, [r12, #36] + LDR r10, [r12, #40] + LDR r11, [r12, #44] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [lr, #32] + STR r5, [lr, #36] + STR r6, [lr, #40] + STR r7, [lr, #44] + LDR r4, [sp, #16] + LDR r5, [sp, #20] + LDR r6, [sp, #24] + LDR r7, [sp, #28] + LDR r8, [r12, #48] + LDR r9, [r12, #52] + LDR r10, [r12, #56] + LDR r11, [r12, #60] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [lr, #48] + STR r5, [lr, #52] + STR r6, [lr, #56] + STR r7, [lr, #60] + LDR r3, [sp, #44] + ADD r12, r12, #0x40 + ADD lr, lr, #0x40 + STR r12, [sp, #40] + STR lr, [sp, #36] + SUBS r3, r3, #0x40 + LDR lr, [sp, #32] + STR r3, [sp, #44] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_chacha_thumb2_crypt_block +#else + BNE.N L_chacha_thumb2_crypt_block +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_crypt_done +#else + B.N L_chacha_thumb2_crypt_done +#endif +L_chacha_thumb2_crypt_lt_block: + /* Store in over field of ChaCha. 
*/ + LDR lr, [sp, #32] + ADD r12, lr, #0x44 + STM r12!, {r0, r1, r2, r3, r4, r5, r6, r7} + LDM sp, {r0, r1, r2, r3, r4, r5, r6, r7} + STM r12, {r0, r1, r2, r3, r4, r5, r6, r7} + LDRD r2, r3, [sp, #40] + LDR r1, [sp, #36] + RSB r12, r3, #0x40 + STR r12, [lr, #64] + ADD lr, lr, #0x44 +L_chacha_thumb2_crypt_16byte_loop: + CMP r3, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_chacha_thumb2_crypt_word_loop +#else + BLT.N L_chacha_thumb2_crypt_word_loop +#endif + /* 16 bytes of state XORed into message. */ + LDM lr!, {r4, r5, r6, r7} + LDR r8, [r2] + LDR r9, [r2, #4] + LDR r10, [r2, #8] + LDR r11, [r2, #12] + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + SUBS r3, r3, #0x10 + STR r8, [r1] + STR r9, [r1, #4] + STR r10, [r1, #8] + STR r11, [r1, #12] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_crypt_done +#else + BEQ.N L_chacha_thumb2_crypt_done +#endif + ADD r2, r2, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_crypt_16byte_loop +#else + B.N L_chacha_thumb2_crypt_16byte_loop +#endif +L_chacha_thumb2_crypt_word_loop: + CMP r3, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_chacha_thumb2_crypt_byte_start +#else + BLT.N L_chacha_thumb2_crypt_byte_start +#endif + /* 4 bytes of state XORed into message. 
*/ + LDR r4, [lr] + LDR r8, [r2] + EOR r8, r8, r4 + SUBS r3, r3, #0x4 + STR r8, [r1] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_crypt_done +#else + BEQ.N L_chacha_thumb2_crypt_done +#endif + ADD lr, lr, #0x4 + ADD r2, r2, #0x4 + ADD r1, r1, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_crypt_word_loop +#else + B.N L_chacha_thumb2_crypt_word_loop +#endif +L_chacha_thumb2_crypt_byte_start: + LDR r4, [lr] +L_chacha_thumb2_crypt_byte_loop: + LDRB r8, [r2] + EOR r8, r8, r4 + SUBS r3, r3, #0x1 + STRB r8, [r1] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_crypt_done +#else + BEQ.N L_chacha_thumb2_crypt_done +#endif + LSR r4, r4, #8 + ADD r2, r2, #0x1 + ADD r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_crypt_byte_loop +#else + B.N L_chacha_thumb2_crypt_byte_loop +#endif +L_chacha_thumb2_crypt_done: + ADD sp, sp, #0x34 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 508 */ + .size wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes + .text + .align 4 + .globl wc_chacha_use_over + .type wc_chacha_use_over, %function +wc_chacha_use_over: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} +L_chacha_thumb2_over_16byte_loop: + CMP r3, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_chacha_thumb2_over_word_loop +#else + BLT.N L_chacha_thumb2_over_word_loop +#endif + /* 16 bytes of state XORed into message. 
*/ + LDR r4, [r0] + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r2] + LDR r9, [r2, #4] + LDR r10, [r2, #8] + LDR r11, [r2, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r3, r3, #0x10 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_over_done +#else + BEQ.N L_chacha_thumb2_over_done +#endif + ADD r0, r0, #0x10 + ADD r2, r2, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_over_16byte_loop +#else + B.N L_chacha_thumb2_over_16byte_loop +#endif +L_chacha_thumb2_over_word_loop: + CMP r3, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_chacha_thumb2_over_byte_loop +#else + BLT.N L_chacha_thumb2_over_byte_loop +#endif + /* 4 bytes of state XORed into message. */ + LDR r4, [r0] + LDR r8, [r2] + EOR r4, r4, r8 + SUBS r3, r3, #0x4 + STR r4, [r1] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_over_done +#else + BEQ.N L_chacha_thumb2_over_done +#endif + ADD r0, r0, #0x4 + ADD r2, r2, #0x4 + ADD r1, r1, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_over_word_loop +#else + B.N L_chacha_thumb2_over_word_loop +#endif +L_chacha_thumb2_over_byte_loop: + /* 1 byte of state XORed into message. 
*/ + LDRB r4, [r0] + LDRB r8, [r2] + EOR r4, r4, r8 + SUBS r3, r3, #0x1 + STRB r4, [r1] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_chacha_thumb2_over_done +#else + BEQ.N L_chacha_thumb2_over_done +#endif + ADD r0, r0, #0x1 + ADD r2, r2, #0x1 + ADD r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_chacha_thumb2_over_byte_loop +#else + B.N L_chacha_thumb2_over_byte_loop +#endif +L_chacha_thumb2_over_done: + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 108 */ + .size wc_chacha_use_over,.-wc_chacha_use_over +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c b/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c new file mode 100644 index 0000000000..0dcdc4e3eb --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c @@ -0,0 +1,731 @@ +/* thumb2-chacha-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./chacha/chacha.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-chacha-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_CHACHA +#include + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p) +#else +void wc_chacha_setiv(word32* x, const byte* iv, word32 counter) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register word32* x __asm__ ("r0") = (word32*)x_p; + register const byte* iv __asm__ ("r1") = (const byte*)iv_p; + register word32 counter __asm__ ("r2") = (word32)counter_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "ADD r3, %[x], #0x34\n\t" + "LDR r4, [%[iv]]\n\t" + "LDR r5, [%[iv], #4]\n\t" + "LDR r6, [%[iv], #8]\n\t" + "STR %[counter], [%[x], #48]\n\t" +#ifdef BIG_ENDIAN_ORDER + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "STM r3, {r4, r5, r6}\n\t" + : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter) + : + : "memory", "r3", "r4", "r5", "r6", "cc" + ); +} + +XALIGNED(16) static const uint32_t L_chacha_thumb2_constants[] = { + 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574, + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void wc_chacha_setkey(word32* x_p, const byte* 
key_p, word32 keySz_p) +#else +void wc_chacha_setkey(word32* x, const byte* key, word32 keySz) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register word32* x __asm__ ("r0") = (word32*)x_p; + register const byte* key __asm__ ("r1") = (const byte*)key_p; + register word32 keySz __asm__ ("r2") = (word32)keySz_p; + register uint32_t* L_chacha_thumb2_constants_c __asm__ ("r3") = (uint32_t*)&L_chacha_thumb2_constants; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r7, %[L_chacha_thumb2_constants]\n\t" + "SUBS %[keySz], %[keySz], #0x10\n\t" + "ADD r7, r7, %[keySz]\n\t" + /* Start state with constants */ + "LDM r7, {r3, r4, r5, r6}\n\t" + "STM %[x]!, {r3, r4, r5, r6}\n\t" + /* Next is first 16 bytes of key. */ + "LDR r3, [%[key]]\n\t" + "LDR r4, [%[key], #4]\n\t" + "LDR r5, [%[key], #8]\n\t" + "LDR r6, [%[key], #12]\n\t" +#ifdef BIG_ENDIAN_ORDER + "REV r3, r3\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" +#endif /* BIG_ENDIAN_ORDER */ + "STM %[x]!, {r3, r4, r5, r6}\n\t" + /* Next 16 bytes of key. */ +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_setkey_same_keyb_ytes_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_setkey_same_keyb_ytes\n\t" +#else + "BEQ.N L_chacha_thumb2_setkey_same_keyb_ytes_%=\n\t" +#endif + /* Update key pointer for next 16 bytes. 
*/ + "ADD %[key], %[key], %[keySz]\n\t" + "LDR r3, [%[key]]\n\t" + "LDR r4, [%[key], #4]\n\t" + "LDR r5, [%[key], #8]\n\t" + "LDR r6, [%[key], #12]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_setkey_same_keyb_ytes:\n\t" +#else + "L_chacha_thumb2_setkey_same_keyb_ytes_%=:\n\t" +#endif + "STM %[x], {r3, r4, r5, r6}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), + [L_chacha_thumb2_constants] "+r" (L_chacha_thumb2_constants_c) + : + : "memory", "r4", "r5", "r6", "r7", "cc" +#else + : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz) + : [L_chacha_thumb2_constants] "r" (L_chacha_thumb2_constants) + : "memory", "r3", "r4", "r5", "r6", "r7", "cc" /* r3 is written as scratch and is not a bound operand in this branch */ +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len_p) +#else +void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register ChaCha* ctx __asm__ ("r0") = (ChaCha*)ctx_p; + register byte* c __asm__ ("r1") = (byte*)c_p; + register const byte* m __asm__ ("r2") = (const byte*)m_p; + register word32 len __asm__ ("r3") = (word32)len_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0x34\n\t" + "MOV lr, %[ctx]\n\t" + "STRD %[ctx], %[c], [sp, #32]\n\t" + "STRD %[m], %[len], [sp, #40]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_block:\n\t" +#else + "L_chacha_thumb2_crypt_block_%=:\n\t" +#endif + /* Put x[12]..x[15] onto stack. */ + "LDRD r4, r5, [lr, #48]\n\t" + "LDRD r6, r7, [lr, #56]\n\t" + "STRD r4, r5, [sp, #16]\n\t" + "STRD r6, r7, [sp, #24]\n\t" + /* Load x[0]..x[12] into registers. */ + "LDM lr, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + /* 10x 2 full rounds to perform. 
*/ + "MOV lr, #0xa\n\t" + "STR lr, [sp, #48]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_loop:\n\t" +#else + "L_chacha_thumb2_crypt_loop_%=:\n\t" +#endif + /* 0, 4, 8, 12 */ + /* 1, 5, 9, 13 */ + "LDR lr, [sp, #20]\n\t" + "ADD %[ctx], %[ctx], r4\n\t" + "ADD %[c], %[c], r5\n\t" + "EOR r12, r12, %[ctx]\n\t" + "EOR lr, lr, %[c]\n\t" + "ROR r12, r12, #16\n\t" + "ROR lr, lr, #16\n\t" + "ADD r8, r8, r12\n\t" + "ADD r9, r9, lr\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "ROR r4, r4, #20\n\t" + "ROR r5, r5, #20\n\t" + "ADD %[ctx], %[ctx], r4\n\t" + "ADD %[c], %[c], r5\n\t" + "EOR r12, r12, %[ctx]\n\t" + "EOR lr, lr, %[c]\n\t" + "ROR r12, r12, #24\n\t" + "ROR lr, lr, #24\n\t" + "ADD r8, r8, r12\n\t" + "ADD r9, r9, lr\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "ROR r4, r4, #25\n\t" + "ROR r5, r5, #25\n\t" + "STR r12, [sp, #16]\n\t" + "STR lr, [sp, #20]\n\t" + /* 2, 6, 10, 14 */ + /* 3, 7, 11, 15 */ + "LDR r12, [sp, #24]\n\t" + "LDR lr, [sp, #28]\n\t" + "ADD %[m], %[m], r6\n\t" + "ADD %[len], %[len], r7\n\t" + "EOR r12, r12, %[m]\n\t" + "EOR lr, lr, %[len]\n\t" + "ROR r12, r12, #16\n\t" + "ROR lr, lr, #16\n\t" + "ADD r10, r10, r12\n\t" + "ADD r11, r11, lr\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "ROR r6, r6, #20\n\t" + "ROR r7, r7, #20\n\t" + "ADD %[m], %[m], r6\n\t" + "ADD %[len], %[len], r7\n\t" + "EOR r12, r12, %[m]\n\t" + "EOR lr, lr, %[len]\n\t" + "ROR r12, r12, #24\n\t" + "ROR lr, lr, #24\n\t" + "ADD r10, r10, r12\n\t" + "ADD r11, r11, lr\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "ROR r6, r6, #25\n\t" + "ROR r7, r7, #25\n\t" + /* 3, 4, 9, 14 */ + /* 0, 5, 10, 15 */ + "ADD %[len], %[len], r4\n\t" + "ADD %[ctx], %[ctx], r5\n\t" + "EOR r12, r12, %[len]\n\t" + "EOR lr, lr, %[ctx]\n\t" + "ROR r12, r12, #16\n\t" + "ROR lr, lr, #16\n\t" + "ADD r9, r9, r12\n\t" + "ADD r10, r10, lr\n\t" + "EOR r4, r4, r9\n\t" + "EOR r5, r5, r10\n\t" + "ROR r4, r4, #20\n\t" + "ROR r5, r5, #20\n\t" + 
"ADD %[len], %[len], r4\n\t" + "ADD %[ctx], %[ctx], r5\n\t" + "EOR r12, r12, %[len]\n\t" + "EOR lr, lr, %[ctx]\n\t" + "ROR r12, r12, #24\n\t" + "ROR lr, lr, #24\n\t" + "ADD r9, r9, r12\n\t" + "ADD r10, r10, lr\n\t" + "EOR r4, r4, r9\n\t" + "EOR r5, r5, r10\n\t" + "ROR r4, r4, #25\n\t" + "ROR r5, r5, #25\n\t" + "STR r12, [sp, #24]\n\t" + "STR lr, [sp, #28]\n\t" + "LDR r12, [sp, #16]\n\t" + "LDR lr, [sp, #20]\n\t" + /* 1, 6, 11, 12 */ + /* 2, 7, 8, 13 */ + "ADD %[c], %[c], r6\n\t" + "ADD %[m], %[m], r7\n\t" + "EOR r12, r12, %[c]\n\t" + "EOR lr, lr, %[m]\n\t" + "ROR r12, r12, #16\n\t" + "ROR lr, lr, #16\n\t" + "ADD r11, r11, r12\n\t" + "ADD r8, r8, lr\n\t" + "EOR r6, r6, r11\n\t" + "EOR r7, r7, r8\n\t" + "ROR r6, r6, #20\n\t" + "ROR r7, r7, #20\n\t" + "ADD %[c], %[c], r6\n\t" + "ADD %[m], %[m], r7\n\t" + "EOR r12, r12, %[c]\n\t" + "EOR lr, lr, %[m]\n\t" + "ROR r12, r12, #24\n\t" + "ROR lr, lr, #24\n\t" + "ADD r11, r11, r12\n\t" + "ADD r8, r8, lr\n\t" + "EOR r6, r6, r11\n\t" + "EOR r7, r7, r8\n\t" + "ROR r6, r6, #25\n\t" + "ROR r7, r7, #25\n\t" + "STR lr, [sp, #20]\n\t" + /* Check if we have done enough rounds. 
*/ + "LDR lr, [sp, #48]\n\t" + "SUBS lr, lr, #0x1\n\t" + "STR lr, [sp, #48]\n\t" +#if defined(__GNUC__) + "BGT L_chacha_thumb2_crypt_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGT.N L_chacha_thumb2_crypt_loop\n\t" +#else + "BGT.N L_chacha_thumb2_crypt_loop_%=\n\t" +#endif + "STM sp, {r8, r9, r10, r11, r12}\n\t" + "LDR lr, [sp, #32]\n\t" + "MOV r12, sp\n\t" + /* Add in original state */ + "LDM lr!, {r8, r9, r10, r11}\n\t" + "ADD %[ctx], %[ctx], r8\n\t" + "ADD %[c], %[c], r9\n\t" + "ADD %[m], %[m], r10\n\t" + "ADD %[len], %[len], r11\n\t" + "LDM lr!, {r8, r9, r10, r11}\n\t" + "ADD r4, r4, r8\n\t" + "ADD r5, r5, r9\n\t" + "ADD r6, r6, r10\n\t" + "ADD r7, r7, r11\n\t" + "LDM r12, {r8, r9}\n\t" + "LDM lr!, {r10, r11}\n\t" + "ADD r8, r8, r10\n\t" + "ADD r9, r9, r11\n\t" + "STM r12!, {r8, r9}\n\t" + "LDM r12, {r8, r9}\n\t" + "LDM lr!, {r10, r11}\n\t" + "ADD r8, r8, r10\n\t" + "ADD r9, r9, r11\n\t" + "STM r12!, {r8, r9}\n\t" + "LDM r12, {r8, r9}\n\t" + "LDM lr!, {r10, r11}\n\t" + "ADD r8, r8, r10\n\t" + "ADD r9, r9, r11\n\t" + "ADD r10, r10, #0x1\n\t" + "STM r12!, {r8, r9}\n\t" + "STR r10, [lr, #-8]\n\t" + "LDM r12, {r8, r9}\n\t" + "LDM lr, {r10, r11}\n\t" + "ADD r8, r8, r10\n\t" + "ADD r9, r9, r11\n\t" + "STM r12, {r8, r9}\n\t" + "LDR r12, [sp, #44]\n\t" + "CMP r12, #0x40\n\t" +#if defined(__GNUC__) + "BLT L_chacha_thumb2_crypt_lt_block_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_chacha_thumb2_crypt_lt_block\n\t" +#else + "BLT.N L_chacha_thumb2_crypt_lt_block_%=\n\t" +#endif + "LDR r12, [sp, #40]\n\t" + "LDR lr, [sp, #36]\n\t" + /* XOR state into 64 bytes. 
*/ + "LDR r8, [r12]\n\t" + "LDR r9, [r12, #4]\n\t" + "LDR r10, [r12, #8]\n\t" + "LDR r11, [r12, #12]\n\t" + "EOR %[ctx], %[ctx], r8\n\t" + "EOR %[c], %[c], r9\n\t" + "EOR %[m], %[m], r10\n\t" + "EOR %[len], %[len], r11\n\t" + "STR %[ctx], [lr]\n\t" + "STR %[c], [lr, #4]\n\t" + "STR %[m], [lr, #8]\n\t" + "STR %[len], [lr, #12]\n\t" + "LDR r8, [r12, #16]\n\t" + "LDR r9, [r12, #20]\n\t" + "LDR r10, [r12, #24]\n\t" + "LDR r11, [r12, #28]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [lr, #16]\n\t" + "STR r5, [lr, #20]\n\t" + "STR r6, [lr, #24]\n\t" + "STR r7, [lr, #28]\n\t" + "LDR r4, [sp]\n\t" + "LDR r5, [sp, #4]\n\t" + "LDR r6, [sp, #8]\n\t" + "LDR r7, [sp, #12]\n\t" + "LDR r8, [r12, #32]\n\t" + "LDR r9, [r12, #36]\n\t" + "LDR r10, [r12, #40]\n\t" + "LDR r11, [r12, #44]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [lr, #32]\n\t" + "STR r5, [lr, #36]\n\t" + "STR r6, [lr, #40]\n\t" + "STR r7, [lr, #44]\n\t" + "LDR r4, [sp, #16]\n\t" + "LDR r5, [sp, #20]\n\t" + "LDR r6, [sp, #24]\n\t" + "LDR r7, [sp, #28]\n\t" + "LDR r8, [r12, #48]\n\t" + "LDR r9, [r12, #52]\n\t" + "LDR r10, [r12, #56]\n\t" + "LDR r11, [r12, #60]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [lr, #48]\n\t" + "STR r5, [lr, #52]\n\t" + "STR r6, [lr, #56]\n\t" + "STR r7, [lr, #60]\n\t" + "LDR %[len], [sp, #44]\n\t" + "ADD r12, r12, #0x40\n\t" + "ADD lr, lr, #0x40\n\t" + "STR r12, [sp, #40]\n\t" + "STR lr, [sp, #36]\n\t" + "SUBS %[len], %[len], #0x40\n\t" + "LDR lr, [sp, #32]\n\t" + "STR %[len], [sp, #44]\n\t" +#if defined(__GNUC__) + "BNE L_chacha_thumb2_crypt_block_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_chacha_thumb2_crypt_block\n\t" +#else + "BNE.N L_chacha_thumb2_crypt_block_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_chacha_thumb2_crypt_done_%=\n\t" +#elif 
defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_crypt_done\n\t" +#else + "B.N L_chacha_thumb2_crypt_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_lt_block:\n\t" +#else + "L_chacha_thumb2_crypt_lt_block_%=:\n\t" +#endif + /* Store in over field of ChaCha. */ + "LDR lr, [sp, #32]\n\t" + "ADD r12, lr, #0x44\n\t" + "STM r12!, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "LDM sp, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "STM r12, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t" + "LDRD %[m], %[len], [sp, #40]\n\t" + "LDR %[c], [sp, #36]\n\t" + "RSB r12, %[len], #0x40\n\t" + "STR r12, [lr, #64]\n\t" + "ADD lr, lr, #0x44\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_16byte_loop:\n\t" +#else + "L_chacha_thumb2_crypt_16byte_loop_%=:\n\t" +#endif + "CMP %[len], #0x10\n\t" +#if defined(__GNUC__) + "BLT L_chacha_thumb2_crypt_word_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_chacha_thumb2_crypt_word_loop\n\t" +#else + "BLT.N L_chacha_thumb2_crypt_word_loop_%=\n\t" +#endif + /* 16 bytes of state XORed into message. 
*/ + "LDM lr!, {r4, r5, r6, r7}\n\t" + "LDR r8, [%[m]]\n\t" + "LDR r9, [%[m], #4]\n\t" + "LDR r10, [%[m], #8]\n\t" + "LDR r11, [%[m], #12]\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "STR r8, [%[c]]\n\t" + "STR r9, [%[c], #4]\n\t" + "STR r10, [%[c], #8]\n\t" + "STR r11, [%[c], #12]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_crypt_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_crypt_done\n\t" +#else + "BEQ.N L_chacha_thumb2_crypt_done_%=\n\t" +#endif + "ADD %[m], %[m], #0x10\n\t" + "ADD %[c], %[c], #0x10\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_crypt_16byte_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_crypt_16byte_loop\n\t" +#else + "B.N L_chacha_thumb2_crypt_16byte_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_word_loop:\n\t" +#else + "L_chacha_thumb2_crypt_word_loop_%=:\n\t" +#endif + "CMP %[len], #0x4\n\t" +#if defined(__GNUC__) + "BLT L_chacha_thumb2_crypt_byte_start_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_chacha_thumb2_crypt_byte_start\n\t" +#else + "BLT.N L_chacha_thumb2_crypt_byte_start_%=\n\t" +#endif + /* 4 bytes of state XORed into message. 
*/ + "LDR r4, [lr]\n\t" + "LDR r8, [%[m]]\n\t" + "EOR r8, r8, r4\n\t" + "SUBS %[len], %[len], #0x4\n\t" + "STR r8, [%[c]]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_crypt_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_crypt_done\n\t" +#else + "BEQ.N L_chacha_thumb2_crypt_done_%=\n\t" +#endif + "ADD lr, lr, #0x4\n\t" + "ADD %[m], %[m], #0x4\n\t" + "ADD %[c], %[c], #0x4\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_crypt_word_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_crypt_word_loop\n\t" +#else + "B.N L_chacha_thumb2_crypt_word_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_byte_start:\n\t" +#else + "L_chacha_thumb2_crypt_byte_start_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_byte_loop:\n\t" +#else + "L_chacha_thumb2_crypt_byte_loop_%=:\n\t" +#endif + "LDRB r8, [%[m]]\n\t" + "EOR r8, r8, r4\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "STRB r8, [%[c]]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_crypt_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_crypt_done\n\t" +#else + "BEQ.N L_chacha_thumb2_crypt_done_%=\n\t" +#endif + "LSR r4, r4, #8\n\t" + "ADD %[m], %[m], #0x1\n\t" + "ADD %[c], %[c], #0x1\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_crypt_byte_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_crypt_byte_loop\n\t" +#else + "B.N L_chacha_thumb2_crypt_byte_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_crypt_done:\n\t" +#else + "L_chacha_thumb2_crypt_done_%=:\n\t" +#endif + "ADD sp, sp, #0x34\n\t" + : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + ); 
+} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word32 len_p) +#else +void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register byte* over __asm__ ("r0") = (byte*)over_p; + register byte* output __asm__ ("r1") = (byte*)output_p; + register const byte* input __asm__ ("r2") = (const byte*)input_p; + register word32 len __asm__ ("r3") = (word32)len_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_over_16byte_loop:\n\t" +#else + "L_chacha_thumb2_over_16byte_loop_%=:\n\t" +#endif + "CMP %[len], #0x10\n\t" +#if defined(__GNUC__) + "BLT L_chacha_thumb2_over_word_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_chacha_thumb2_over_word_loop\n\t" +#else + "BLT.N L_chacha_thumb2_over_word_loop_%=\n\t" +#endif + /* 16 bytes of state XORed into message. 
*/ + "LDR r4, [%[over]]\n\t" + "LDR r5, [%[over], #4]\n\t" + "LDR r6, [%[over], #8]\n\t" + "LDR r7, [%[over], #12]\n\t" + "LDR r8, [%[input]]\n\t" + "LDR r9, [%[input], #4]\n\t" + "LDR r10, [%[input], #8]\n\t" + "LDR r11, [%[input], #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "STR r4, [%[output]]\n\t" + "STR r5, [%[output], #4]\n\t" + "STR r6, [%[output], #8]\n\t" + "STR r7, [%[output], #12]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_over_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_over_done\n\t" +#else + "BEQ.N L_chacha_thumb2_over_done_%=\n\t" +#endif + "ADD %[over], %[over], #0x10\n\t" + "ADD %[input], %[input], #0x10\n\t" + "ADD %[output], %[output], #0x10\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_over_16byte_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_over_16byte_loop\n\t" +#else + "B.N L_chacha_thumb2_over_16byte_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_over_word_loop:\n\t" +#else + "L_chacha_thumb2_over_word_loop_%=:\n\t" +#endif + "CMP %[len], #0x4\n\t" +#if defined(__GNUC__) + "BLT L_chacha_thumb2_over_byte_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_chacha_thumb2_over_byte_loop\n\t" +#else + "BLT.N L_chacha_thumb2_over_byte_loop_%=\n\t" +#endif + /* 4 bytes of state XORed into message. 
*/ + "LDR r4, [%[over]]\n\t" + "LDR r8, [%[input]]\n\t" + "EOR r4, r4, r8\n\t" + "SUBS %[len], %[len], #0x4\n\t" + "STR r4, [%[output]]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_over_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_over_done\n\t" +#else + "BEQ.N L_chacha_thumb2_over_done_%=\n\t" +#endif + "ADD %[over], %[over], #0x4\n\t" + "ADD %[input], %[input], #0x4\n\t" + "ADD %[output], %[output], #0x4\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_over_word_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_over_word_loop\n\t" +#else + "B.N L_chacha_thumb2_over_word_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_over_byte_loop:\n\t" +#else + "L_chacha_thumb2_over_byte_loop_%=:\n\t" +#endif + /* 1 byte of state XORed into message. */ + "LDRB r4, [%[over]]\n\t" + "LDRB r8, [%[input]]\n\t" + "EOR r4, r4, r8\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "STRB r4, [%[output]]\n\t" +#if defined(__GNUC__) + "BEQ L_chacha_thumb2_over_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_chacha_thumb2_over_done\n\t" +#else + "BEQ.N L_chacha_thumb2_over_done_%=\n\t" +#endif + "ADD %[over], %[over], #0x1\n\t" + "ADD %[input], %[input], #0x1\n\t" + "ADD %[output], %[output], #0x1\n\t" +#if defined(__GNUC__) + "B L_chacha_thumb2_over_byte_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_chacha_thumb2_over_byte_loop\n\t" +#else + "B.N L_chacha_thumb2_over_byte_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_chacha_thumb2_over_done:\n\t" +#else + "L_chacha_thumb2_over_done_%=:\n\t" +#endif + : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), [len] "+r" (len) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +#endif /* HAVE_CHACHA */ +#endif /* !__aarch64__ && __thumb__ 
*/ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-chacha.c b/wolfcrypt/src/port/arm/thumb2-chacha.c new file mode 100644 index 0000000000..5e8e323ae7 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-chacha.c @@ -0,0 +1,187 @@ +/* thumb2-chacha.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#if defined(WOLFSSL_ARMASM) && defined(__thumb__) +#ifdef HAVE_CHACHA + +#include +#include +#include +#include +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +#ifdef CHACHA_AEAD_TEST + #include +#endif + +#ifdef CHACHA_TEST + #include +#endif + + +extern void wc_chacha_setiv(word32* x, const byte* iv, word32 counter); + +/* Set the Initialization Vector (IV) and counter into ChaCha context. + * + * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version + * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. + * + * @param [in] ctx ChaCha context. + * @param [in] iv IV to set. + * @param [in] counter Starting value of counter. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or IV is NULL. 
+ */ +int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter) +{ + int ret = 0; +#ifdef CHACHA_AEAD_TEST + word32 i; + + printf("NONCE : "); + if (iv != NULL) { + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", iv[i]); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (iv == NULL)) { + ret = BAD_FUNC_ARG; + } + if (ret == 0) { + /* No unused bytes to XOR into input. */ + ctx->left = 0; + + /* Set counter and IV into state. */ + wc_chacha_setiv(ctx->X, iv, counter); + } + + return ret; +} + +extern void wc_chacha_setkey(word32* x, const byte* key, word32 keySz); + +/* Set the key into the ChaCha context. + * + * Key setup. 8 word iv (nonce) + * + * @param [in] ctx ChaCha context. + * @param [in] key Key to set. + * @param [in] keySz Length of key in bytes. Valid values: + * CHACHA_MAX_KEY_SZ and (CHACHA_MAX_KEY_SZ / 2) + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or key is NULL. + * @return BAD_FUNC_ARG when keySz is invalid. + */ +int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) +{ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + printf("ChaCha key used :\n"); + if (key != NULL) { + word32 i; + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i % 8) == 7) + printf("\n"); + } + } + printf("\n\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (key == NULL)) { + ret = BAD_FUNC_ARG; + } + else if ((keySz != (CHACHA_MAX_KEY_SZ / 2)) && + (keySz != CHACHA_MAX_KEY_SZ )) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + ctx->left = 0; + + wc_chacha_setkey(ctx->X, key, keySz); + } + + return ret; +} + +extern void wc_chacha_use_over(byte* over, byte* output, const byte* input, + word32 len); +extern void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, + word32 len); + +/* API to encrypt/decrypt a message of any size. + * + * @param [in] ctx ChaCha context. + * @param [out] output Enciphered output. + * @param [in] input Input to encipher. 
+ * @param [in] len Length of input in bytes. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx, output or input is NULL. + */ +int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, word32 len) +{ + int ret = 0; + + if ((ctx == NULL) || (output == NULL) || (input == NULL)) { + ret = BAD_FUNC_ARG; + } + + /* Handle left over bytes from last block. */ + if ((ret == 0) && (len > 0) && (ctx->left > 0)) { + byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left; + word32 l = min(len, ctx->left); + + wc_chacha_use_over(over, output, input, l); + + ctx->left -= l; + input += l; + output += l; + len -= l; + } + + if ((ret == 0) && (len != 0)) { + wc_chacha_crypt_bytes(ctx, output, input, len); + } + + return ret; +} + +#endif /* HAVE_CHACHA */ +#endif /* WOLFSSL_ARMASM && __thumb__ */ From 27033c225f75924ecc6ae43306f9891268784783 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 3 Sep 2024 11:20:08 +1000 Subject: [PATCH 2/2] Thumb-2 ChaCha, Poly1305: implemention in assembly Implementation of ChaCha algorithm for ARM Thumb-2. Implementation of Poly1305 algorithm for ARM Thumb-2. 
--- src/include.am | 6 + wolfcrypt/src/poly1305.c | 14 +- wolfcrypt/src/port/arm/thumb2-chacha.c | 9 - wolfcrypt/src/port/arm/thumb2-poly1305-asm.S | 369 +++++++++++++++ .../src/port/arm/thumb2-poly1305-asm_c.c | 422 ++++++++++++++++++ wolfcrypt/src/port/arm/thumb2-poly1305.c | 142 ++++++ wolfcrypt/test/test.c | 71 +-- wolfssl/wolfcrypt/chacha.h | 10 + wolfssl/wolfcrypt/poly1305.h | 24 +- 9 files changed, 1017 insertions(+), 50 deletions(-) create mode 100644 wolfcrypt/src/port/arm/thumb2-poly1305-asm.S create mode 100644 wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c create mode 100644 wolfcrypt/src/port/arm/thumb2-poly1305.c diff --git a/src/include.am b/src/include.am index 61f89f86d4..c3d8376a1d 100644 --- a/src/include.am +++ b/src/include.am @@ -922,6 +922,12 @@ if !BUILD_FIPS_RAND if BUILD_POLY1305 if BUILD_ARMASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm.S +endif !BUILD_ARMASM_INLINE endif if BUILD_RISCV_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-poly1305.c diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index b4b5c0f7ea..48529d78c1 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -231,7 +231,8 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac); p[7] = (byte)(v >> 56); } #endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ -#else /* if not 64 bit then use 32 bit */ +/* if not 64 bit then use 32 bit */ +#elif !defined(WOLFSSL_ARMASM) || !defined(__thumb__) static word32 U8TO32(const byte *p) { @@ -268,8 +269,8 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8]) } -#if (!defined(WOLFSSL_ARMASM) || !defined(__aarch64__)) && \ - 
!defined(WOLFSSL_RISCV_ASM) +#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ + !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) /* This local function operates on a message with a given number of bytes with a given ctx pointer to a Poly1305 structure. @@ -788,7 +789,8 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) return 0; } -#endif /* (!WOLFSSL_ARMASM || !__aarch64__) && !WOLFSSL_RISCV_ASM */ +#endif /* (!WOLFSSL_ARMASM || (!__aarch64__ && !__thumb__)) && + * !WOLFSSL_RISCV_ASM */ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) @@ -883,8 +885,8 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) /* process full blocks */ if (bytes >= POLY1305_BLOCK_SIZE) { size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1)); -#if (!defined(WOLFSSL_ARMASM) || !defined(__aarch64__)) && \ - !defined(WOLFSSL_RISCV_ASM) +#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ + !defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM) int ret; ret = poly1305_blocks(ctx, m, want); if (ret != 0) diff --git a/wolfcrypt/src/port/arm/thumb2-chacha.c b/wolfcrypt/src/port/arm/thumb2-chacha.c index 5e8e323ae7..a189ccddd3 100644 --- a/wolfcrypt/src/port/arm/thumb2-chacha.c +++ b/wolfcrypt/src/port/arm/thumb2-chacha.c @@ -49,8 +49,6 @@ #endif -extern void wc_chacha_setiv(word32* x, const byte* iv, word32 counter); - /* Set the Initialization Vector (IV) and counter into ChaCha context. * * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version @@ -92,8 +90,6 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter) return ret; } -extern void wc_chacha_setkey(word32* x, const byte* key, word32 keySz); - /* Set the key into the ChaCha context. * * Key setup. 
8 word iv (nonce) @@ -141,11 +137,6 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) return ret; } -extern void wc_chacha_use_over(byte* over, byte* output, const byte* input, - word32 len); -extern void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, - word32 len); - /* API to encrypt/decrypt a message of any size. * * @param [in] ctx ChaCha context. diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S new file mode 100644 index 0000000000..b727e8164e --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S @@ -0,0 +1,369 @@ +/* thumb2-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef HAVE_POLY1305 + .text + .align 4 + .globl poly1305_blocks_thumb2_16 + .type poly1305_blocks_thumb2_16, %function +poly1305_blocks_thumb2_16: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x1c + CMP r2, #0x0 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_poly1305_thumb2_16_done +#else + BEQ.N L_poly1305_thumb2_16_done +#endif + ADD lr, sp, #0xc + STM lr, {r0, r1, r2, r3} + /* Get h pointer */ + ADD lr, r0, #0x10 + LDM lr, {r4, r5, r6, r7, r8} +L_poly1305_thumb2_16_loop: + /* Add m to h */ + LDR r1, [sp, #16] + LDR r2, [r1] + LDR r3, [r1, #4] + LDR r9, [r1, #8] + LDR r10, [r1, #12] + LDR r11, [sp, #24] + ADDS r4, r4, r2 + ADCS r5, r5, r3 + ADCS r6, r6, r9 + ADCS r7, r7, r10 + ADD r1, r1, #0x10 + ADC r8, r8, r11 +#ifdef WOLFSSL_SP_NO_UMAAL + STM lr, {r4, r5, r6, r7, r8} +#else + /* h[0]-h[2] in r4-r6 for multiplication. 
*/ + STR r7, [lr, #12] + STR r8, [lr, #16] +#endif /* WOLFSSL_SP_NO_UMAAL */ + STR r1, [sp, #16] + LDR r1, [sp, #12] + /* Multiply h by r */ +#ifdef WOLFSSL_SP_NO_UMAAL + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + LDR r3, [r1] + EOR r0, r0, r0 + /* r[0] * h[0] */ + /* h[0] in r4 */ + UMULL r4, r5, r3, r4 + /* r[0] * h[2] */ + /* h[2] in r6 */ + UMULL r6, r7, r3, r6 + /* r[0] * h[4] */ + /* h[4] in r8 */ + MUL r8, r3, r8 + /* r[0] * h[1] */ + LDR r2, [lr, #4] + MOV r12, r0 + UMLAL r5, r12, r3, r2 + /* r[0] * h[3] */ + LDR r2, [lr, #12] + ADDS r6, r6, r12 + ADC r7, r7, r0 + UMLAL r7, r8, r3, r2 + /* r[1] * h[0] */ + LDR r3, [r1, #4] + LDR r2, [lr] + MOV r12, r0 + UMLAL r5, r12, r3, r2 + /* r[1] * h[1] */ + LDR r2, [lr, #4] + ADDS r6, r6, r12 + ADC r12, r0, r0 + UMLAL r6, r12, r3, r2 + /* r[1] * h[2] */ + LDR r2, [lr, #8] + ADDS r7, r7, r12 + ADC r12, r0, r0 + UMLAL r7, r12, r3, r2 + /* r[1] * h[3] */ + LDR r2, [lr, #12] + ADDS r8, r8, r12 + ADC r9, r0, r0 + UMLAL r8, r9, r3, r2 + /* r[1] * h[4] */ + LDR r2, [lr, #16] + MLA r9, r3, r2, r9 + /* r[2] * h[0] */ + LDR r3, [r1, #8] + LDR r2, [lr] + MOV r12, r0 + UMLAL r6, r12, r3, r2 + /* r[2] * h[1] */ + LDR r2, [lr, #4] + ADDS r7, r7, r12 + ADC r12, r0, r0 + UMLAL r7, r12, r3, r2 + /* r[2] * h[2] */ + LDR r2, [lr, #8] + ADDS r8, r8, r12 + ADC r12, r0, r0 + UMLAL r8, r12, r3, r2 + /* r[2] * h[3] */ + LDR r2, [lr, #12] + ADDS r9, r9, r12 + ADC r10, r0, r0 + UMLAL r9, r10, r3, r2 + /* r[2] * h[4] */ + LDR r2, [lr, #16] + MLA r10, r3, r2, r10 + /* r[3] * h[0] */ + LDR r3, [r1, #12] + LDR r2, [lr] + MOV r12, r0 + UMLAL r7, r12, r3, r2 + /* r[3] * h[1] */ + LDR r2, [lr, #4] + ADDS r8, r8, r12 + ADC r12, r0, r0 + UMLAL r8, r12, r3, r2 + /* r[3] * h[2] */ + LDR r2, [lr, #8] + ADDS r9, r9, r12 + ADC r10, r10, r0 + UMLAL r9, r10, r3, r2 + /* r[3] * h[3] */ + LDR r2, [lr, #12] + MOV r11, r0 + UMLAL r10, r11, r3, r2 + /* r[3] * h[4] */ + LDR r2, [lr, #16] + MOV r12, r0 + MLA r11, r3, r2, r11 +#else + LDM r1, {r0, r1, r2, 
r3} + /* r[0] * h[0] */ + UMULL r10, r11, r0, r4 + /* r[1] * h[0] */ + UMULL r12, r7, r1, r4 + /* r[0] * h[1] */ + UMAAL r11, r12, r0, r5 + /* r[2] * h[0] */ + UMULL r8, r9, r2, r4 + /* r[1] * h[1] */ + UMAAL r12, r8, r1, r5 + /* r[0] * h[2] */ + UMAAL r12, r7, r0, r6 + /* r[3] * h[0] */ + UMAAL r8, r9, r3, r4 + STM sp, {r10, r11, r12} + /* r[2] * h[1] */ + UMAAL r7, r8, r2, r5 + /* Replace h[0] with h[3] */ + LDR r4, [lr, #12] + /* r[1] * h[2] */ + UMULL r10, r11, r1, r6 + /* r[2] * h[2] */ + UMAAL r8, r9, r2, r6 + /* r[0] * h[3] */ + UMAAL r7, r10, r0, r4 + /* r[3] * h[1] */ + UMAAL r8, r11, r3, r5 + /* r[1] * h[3] */ + UMAAL r8, r10, r1, r4 + /* r[3] * h[2] */ + UMAAL r9, r11, r3, r6 + /* r[2] * h[3] */ + UMAAL r9, r10, r2, r4 + /* Replace h[1] with h[4] */ + LDR r5, [lr, #16] + /* r[3] * h[3] */ + UMAAL r10, r11, r3, r4 + MOV r12, #0x0 + /* r[0] * h[4] */ + UMAAL r8, r12, r0, r5 + /* r[1] * h[4] */ + UMAAL r9, r12, r1, r5 + /* r[2] * h[4] */ + UMAAL r10, r12, r2, r5 + /* r[3] * h[4] */ + UMAAL r11, r12, r3, r5 + /* DONE */ + LDM sp, {r4, r5, r6} +#endif /* WOLFSSL_SP_NO_UMAAL */ + /* r12 will be zero because r is masked. */ + /* Load length */ + LDR r2, [sp, #20] + /* Reduce mod 2^130 - 5 */ + BIC r3, r8, #0x3 + AND r8, r8, #0x3 + ADDS r4, r4, r3 + LSR r3, r3, #2 + ADCS r5, r5, r9 + ORR r3, r3, r9, LSL #30 + ADCS r6, r6, r10 + LSR r9, r9, #2 + ADCS r7, r7, r11 + ORR r9, r9, r10, LSL #30 + ADC r8, r8, r12 + LSR r10, r10, #2 + ADDS r4, r4, r3 + ORR r10, r10, r11, LSL #30 + ADCS r5, r5, r9 + LSR r11, r11, #2 + ADCS r6, r6, r10 + ADCS r7, r7, r11 + ADC r8, r8, r12 + /* Sub 16 from length. */ + SUBS r2, r2, #0x10 + /* Store length. */ + STR r2, [sp, #20] + /* Loop again if more message to do. 
*/ +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGT L_poly1305_thumb2_16_loop +#else + BGT.N L_poly1305_thumb2_16_loop +#endif + STM lr, {r4, r5, r6, r7, r8} +L_poly1305_thumb2_16_done: + ADD sp, sp, #0x1c + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 250 */ + .size poly1305_blocks_thumb2_16,.-poly1305_blocks_thumb2_16 + .text + .type L_poly1305_thumb2_clamp, %object + .size L_poly1305_thumb2_clamp, 16 + .align 4 +L_poly1305_thumb2_clamp: + .word 0xfffffff + .word 0xffffffc + .word 0xffffffc + .word 0xffffffc + .text + .align 4 + .globl poly1305_set_key + .type poly1305_set_key, %function +poly1305_set_key: + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + /* Load mask. */ + ADR r10, L_poly1305_thumb2_clamp + LDM r10, {r6, r7, r8, r9} + /* Load and cache padding. */ + LDR r2, [r1, #16] + LDR r3, [r1, #20] + LDR r4, [r1, #24] + LDR r5, [r1, #28] + ADD r10, r0, #0x24 + STM r10, {r2, r3, r4, r5} + /* Load, mask and store r. */ + LDR r2, [r1] + LDR r3, [r1, #4] + LDR r4, [r1, #8] + LDR r5, [r1, #12] + AND r2, r2, r6 + AND r3, r3, r7 + AND r4, r4, r8 + AND r5, r5, r9 + ADD r10, r0, #0x0 + STM r10, {r2, r3, r4, r5} + /* h (accumulator) = 0 */ + EOR r6, r6, r6 + EOR r7, r7, r7 + EOR r8, r8, r8 + EOR r9, r9, r9 + ADD r10, r0, #0x10 + EOR r5, r5, r5 + STM r10, {r5, r6, r7, r8, r9} + /* Zero leftover */ + STR r5, [r0, #52] + POP {r4, r5, r6, r7, r8, r9, r10, pc} + /* Cycle Count = 70 */ + .size poly1305_set_key,.-poly1305_set_key + .text + .align 4 + .globl poly1305_final + .type poly1305_final, %function +poly1305_final: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ADD r11, r0, #0x10 + LDM r11, {r2, r3, r4, r5, r6} + /* Add 5 and check for h larger than p. */ + ADDS r7, r2, #0x5 + ADCS r7, r3, #0x0 + ADCS r7, r4, #0x0 + ADCS r7, r5, #0x0 + ADC r7, r6, #0x0 + SUB r7, r7, #0x4 + LSR r7, r7, #31 + SUB r7, r7, #0x1 + AND r7, r7, #0x5 + /* Add 0/5 to h. 
*/ + ADDS r2, r2, r7 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADC r5, r5, #0x0 + /* Add padding */ + ADD r11, r0, #0x24 + LDM r11, {r7, r8, r9, r10} + ADDS r2, r2, r7 + ADCS r3, r3, r8 + ADCS r4, r4, r9 + ADC r5, r5, r10 + /* Store MAC */ + STR r2, [r1] + STR r3, [r1, #4] + STR r4, [r1, #8] + STR r5, [r1, #12] + /* Zero out h. */ + EOR r2, r2, r2 + EOR r3, r3, r3 + EOR r4, r4, r4 + EOR r5, r5, r5 + EOR r6, r6, r6 + ADD r11, r0, #0x10 + STM r11, {r2, r3, r4, r5, r6} + /* Zero out r. */ + ADD r11, r0, #0x0 + STM r11, {r2, r3, r4, r5} + /* Zero out padding. */ + ADD r11, r0, #0x24 + STM r11, {r2, r3, r4, r5} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 82 */ + .size poly1305_final,.-poly1305_final +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c new file mode 100644 index 0000000000..437141ab06 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c @@ -0,0 +1,422 @@ +/* thumb2-poly1305-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./poly1305/poly1305.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-poly1305-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef HAVE_POLY1305 +#include + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void poly1305_blocks_thumb2_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p) +#else +void poly1305_blocks_thumb2_16(Poly1305* ctx, const byte* m, word32 len, int notLast) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p; + register const byte* m __asm__ ("r1") = (const byte*)m_p; + register word32 len __asm__ ("r2") = (word32)len_p; + register int notLast __asm__ ("r3") = (int)notLast_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0x1c\n\t" + "CMP %[len], #0x0\n\t" +#if defined(__GNUC__) + "BEQ L_poly1305_thumb2_16_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_poly1305_thumb2_16_done\n\t" +#else + "BEQ.N L_poly1305_thumb2_16_done_%=\n\t" +#endif + "ADD lr, sp, #0xc\n\t" + "STM lr, {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* Get h pointer */ + "ADD lr, %[ctx], #0x10\n\t" + "LDM lr, {r4, r5, r6, r7, r8}\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_poly1305_thumb2_16_loop:\n\t" +#else + 
"L_poly1305_thumb2_16_loop_%=:\n\t" +#endif + /* Add m to h */ + "LDR %[m], [sp, #16]\n\t" + "LDR %[len], [%[m]]\n\t" + "LDR %[notLast], [%[m], #4]\n\t" + "LDR r9, [%[m], #8]\n\t" + "LDR r10, [%[m], #12]\n\t" + "LDR r11, [sp, #24]\n\t" + "ADDS r4, r4, %[len]\n\t" + "ADCS r5, r5, %[notLast]\n\t" + "ADCS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADD %[m], %[m], #0x10\n\t" + "ADC r8, r8, r11\n\t" +#ifdef WOLFSSL_SP_NO_UMAAL + "STM lr, {r4, r5, r6, r7, r8}\n\t" +#else + /* h[0]-h[2] in r4-r6 for multiplication. */ + "STR r7, [lr, #12]\n\t" + "STR r8, [lr, #16]\n\t" +#endif /* WOLFSSL_SP_NO_UMAAL */ + "STR %[m], [sp, #16]\n\t" + "LDR %[m], [sp, #12]\n\t" + /* Multiply h by r */ +#ifdef WOLFSSL_SP_NO_UMAAL + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + "LDR %[notLast], [%[m]]\n\t" + "EOR %[ctx], %[ctx], %[ctx]\n\t" + /* r[0] * h[0] */ + /* h[0] in r4 */ + "UMULL r4, r5, %[notLast], r4\n\t" + /* r[0] * h[2] */ + /* h[2] in r6 */ + "UMULL r6, r7, %[notLast], r6\n\t" + /* r[0] * h[4] */ + /* h[4] in r8 */ + "MUL r8, %[notLast], r8\n\t" + /* r[0] * h[1] */ + "LDR %[len], [lr, #4]\n\t" + "MOV r12, %[ctx]\n\t" + "UMLAL r5, r12, %[notLast], %[len]\n\t" + /* r[0] * h[3] */ + "LDR %[len], [lr, #12]\n\t" + "ADDS r6, r6, r12\n\t" + "ADC r7, r7, %[ctx]\n\t" + "UMLAL r7, r8, %[notLast], %[len]\n\t" + /* r[1] * h[0] */ + "LDR %[notLast], [%[m], #4]\n\t" + "LDR %[len], [lr]\n\t" + "MOV r12, %[ctx]\n\t" + "UMLAL r5, r12, %[notLast], %[len]\n\t" + /* r[1] * h[1] */ + "LDR %[len], [lr, #4]\n\t" + "ADDS r6, r6, r12\n\t" + "ADC r12, %[ctx], %[ctx]\n\t" + "UMLAL r6, r12, %[notLast], %[len]\n\t" + /* r[1] * h[2] */ + "LDR %[len], [lr, #8]\n\t" + "ADDS r7, r7, r12\n\t" + "ADC r12, %[ctx], %[ctx]\n\t" + "UMLAL r7, r12, %[notLast], %[len]\n\t" + /* r[1] * h[3] */ + "LDR %[len], [lr, #12]\n\t" + "ADDS r8, r8, r12\n\t" + "ADC r9, %[ctx], %[ctx]\n\t" + "UMLAL r8, r9, %[notLast], %[len]\n\t" + /* r[1] * h[4] */ + "LDR %[len], [lr, #16]\n\t" + "MLA r9, %[notLast], %[len], r9\n\t" + /* r[2] 
* h[0] */ + "LDR %[notLast], [%[m], #8]\n\t" + "LDR %[len], [lr]\n\t" + "MOV r12, %[ctx]\n\t" + "UMLAL r6, r12, %[notLast], %[len]\n\t" + /* r[2] * h[1] */ + "LDR %[len], [lr, #4]\n\t" + "ADDS r7, r7, r12\n\t" + "ADC r12, %[ctx], %[ctx]\n\t" + "UMLAL r7, r12, %[notLast], %[len]\n\t" + /* r[2] * h[2] */ + "LDR %[len], [lr, #8]\n\t" + "ADDS r8, r8, r12\n\t" + "ADC r12, %[ctx], %[ctx]\n\t" + "UMLAL r8, r12, %[notLast], %[len]\n\t" + /* r[2] * h[3] */ + "LDR %[len], [lr, #12]\n\t" + "ADDS r9, r9, r12\n\t" + "ADC r10, %[ctx], %[ctx]\n\t" + "UMLAL r9, r10, %[notLast], %[len]\n\t" + /* r[2] * h[4] */ + "LDR %[len], [lr, #16]\n\t" + "MLA r10, %[notLast], %[len], r10\n\t" + /* r[3] * h[0] */ + "LDR %[notLast], [%[m], #12]\n\t" + "LDR %[len], [lr]\n\t" + "MOV r12, %[ctx]\n\t" + "UMLAL r7, r12, %[notLast], %[len]\n\t" + /* r[3] * h[1] */ + "LDR %[len], [lr, #4]\n\t" + "ADDS r8, r8, r12\n\t" + "ADC r12, %[ctx], %[ctx]\n\t" + "UMLAL r8, r12, %[notLast], %[len]\n\t" + /* r[3] * h[2] */ + "LDR %[len], [lr, #8]\n\t" + "ADDS r9, r9, r12\n\t" + "ADC r10, r10, %[ctx]\n\t" + "UMLAL r9, r10, %[notLast], %[len]\n\t" + /* r[3] * h[3] */ + "LDR %[len], [lr, #12]\n\t" + "MOV r11, %[ctx]\n\t" + "UMLAL r10, r11, %[notLast], %[len]\n\t" + /* r[3] * h[4] */ + "LDR %[len], [lr, #16]\n\t" + "MOV r12, %[ctx]\n\t" + "MLA r11, %[notLast], %[len], r11\n\t" +#else + "LDM %[m], {%[ctx], %[m], %[len], %[notLast]}\n\t" + /* r[0] * h[0] */ + "UMULL r10, r11, %[ctx], r4\n\t" + /* r[1] * h[0] */ + "UMULL r12, r7, %[m], r4\n\t" + /* r[0] * h[1] */ + "UMAAL r11, r12, %[ctx], r5\n\t" + /* r[2] * h[0] */ + "UMULL r8, r9, %[len], r4\n\t" + /* r[1] * h[1] */ + "UMAAL r12, r8, %[m], r5\n\t" + /* r[0] * h[2] */ + "UMAAL r12, r7, %[ctx], r6\n\t" + /* r[3] * h[0] */ + "UMAAL r8, r9, %[notLast], r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + /* r[2] * h[1] */ + "UMAAL r7, r8, %[len], r5\n\t" + /* Replace h[0] with h[3] */ + "LDR r4, [lr, #12]\n\t" + /* r[1] * h[2] */ + "UMULL r10, r11, %[m], r6\n\t" + /* r[2] * h[2] */ + 
"UMAAL r8, r9, %[len], r6\n\t" + /* r[0] * h[3] */ + "UMAAL r7, r10, %[ctx], r4\n\t" + /* r[3] * h[1] */ + "UMAAL r8, r11, %[notLast], r5\n\t" + /* r[1] * h[3] */ + "UMAAL r8, r10, %[m], r4\n\t" + /* r[3] * h[2] */ + "UMAAL r9, r11, %[notLast], r6\n\t" + /* r[2] * h[3] */ + "UMAAL r9, r10, %[len], r4\n\t" + /* Replace h[1] with h[4] */ + "LDR r5, [lr, #16]\n\t" + /* r[3] * h[3] */ + "UMAAL r10, r11, %[notLast], r4\n\t" + "MOV r12, #0x0\n\t" + /* r[0] * h[4] */ + "UMAAL r8, r12, %[ctx], r5\n\t" + /* r[1] * h[4] */ + "UMAAL r9, r12, %[m], r5\n\t" + /* r[2] * h[4] */ + "UMAAL r10, r12, %[len], r5\n\t" + /* r[3] * h[4] */ + "UMAAL r11, r12, %[notLast], r5\n\t" + /* DONE */ + "LDM sp, {r4, r5, r6}\n\t" +#endif /* WOLFSSL_SP_NO_UMAAL */ + /* r12 will be zero because r is masked. */ + /* Load length */ + "LDR %[len], [sp, #20]\n\t" + /* Reduce mod 2^130 - 5 */ + "BIC %[notLast], r8, #0x3\n\t" + "AND r8, r8, #0x3\n\t" + "ADDS r4, r4, %[notLast]\n\t" + "LSR %[notLast], %[notLast], #2\n\t" + "ADCS r5, r5, r9\n\t" + "ORR %[notLast], %[notLast], r9, LSL #30\n\t" + "ADCS r6, r6, r10\n\t" + "LSR r9, r9, #2\n\t" + "ADCS r7, r7, r11\n\t" + "ORR r9, r9, r10, LSL #30\n\t" + "ADC r8, r8, r12\n\t" + "LSR r10, r10, #2\n\t" + "ADDS r4, r4, %[notLast]\n\t" + "ORR r10, r10, r11, LSL #30\n\t" + "ADCS r5, r5, r9\n\t" + "LSR r11, r11, #2\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "ADC r8, r8, r12\n\t" + /* Sub 16 from length. */ + "SUBS %[len], %[len], #0x10\n\t" + /* Store length. */ + "STR %[len], [sp, #20]\n\t" + /* Loop again if more message to do. 
*/ +#if defined(__GNUC__) + "BGT L_poly1305_thumb2_16_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGT.N L_poly1305_thumb2_16_loop\n\t" +#else + "BGT.N L_poly1305_thumb2_16_loop_%=\n\t" +#endif + "STM lr, {r4, r5, r6, r7, r8}\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_poly1305_thumb2_16_done:\n\t" +#else + "L_poly1305_thumb2_16_done_%=:\n\t" +#endif + "ADD sp, sp, #0x1c\n\t" + : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), [notLast] "+r" (notLast) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" + ); +} + +XALIGNED(16) static const uint32_t L_poly1305_thumb2_clamp[] = { + 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, +}; +/* Set key: store clamped r and pad in ctx; zero h and leftover. */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) +#else +void poly1305_set_key(Poly1305* ctx, const byte* key) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p; + register const byte* key __asm__ ("r1") = (const byte*)key_p; + register uint32_t* L_poly1305_thumb2_clamp_c __asm__ ("r2") = (uint32_t*)&L_poly1305_thumb2_clamp; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + /* Load mask. */ + "MOV r10, %[L_poly1305_thumb2_clamp]\n\t" + "LDM r10, {r6, r7, r8, r9}\n\t" + /* Load and cache padding: key[16..31] to ctx + 0x24. */ + "LDR r2, [%[key], #16]\n\t" + "LDR r3, [%[key], #20]\n\t" + "LDR r4, [%[key], #24]\n\t" + "LDR r5, [%[key], #28]\n\t" + "ADD r10, %[ctx], #0x24\n\t" + "STM r10, {r2, r3, r4, r5}\n\t" + /* Load, mask and store r.
*/ + "LDR r2, [%[key]]\n\t" + "LDR r3, [%[key], #4]\n\t" + "LDR r4, [%[key], #8]\n\t" + "LDR r5, [%[key], #12]\n\t" + "AND r2, r2, r6\n\t" + "AND r3, r3, r7\n\t" + "AND r4, r4, r8\n\t" + "AND r5, r5, r9\n\t" + "ADD r10, %[ctx], #0x0\n\t" + "STM r10, {r2, r3, r4, r5}\n\t" + /* h (accumulator) = 0 */ + "EOR r6, r6, r6\n\t" + "EOR r7, r7, r7\n\t" + "EOR r8, r8, r8\n\t" + "EOR r9, r9, r9\n\t" + "ADD r10, %[ctx], #0x10\n\t" + "EOR r5, r5, r5\n\t" + "STM r10, {r5, r6, r7, r8, r9}\n\t" + /* Zero leftover */ + "STR r5, [%[ctx], #52]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [ctx] "+r" (ctx), [key] "+r" (key), + [L_poly1305_thumb2_clamp] "+r" (L_poly1305_thumb2_clamp_c) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc" +#else /* registers compiler-chosen: r2 is written above, so clobber it */ + : [ctx] "+r" (ctx), [key] "+r" (key) + : [L_poly1305_thumb2_clamp] "r" (L_poly1305_thumb2_clamp) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} +/* Finalize: reduce h, add pad, write 16-byte MAC; zero h, r and pad. */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void poly1305_final(Poly1305* ctx_p, byte* mac_p) +#else +void poly1305_final(Poly1305* ctx, byte* mac) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p; + register byte* mac __asm__ ("r1") = (byte*)mac_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "ADD r11, %[ctx], #0x10\n\t" + "LDM r11, {r2, r3, r4, r5, r6}\n\t" + /* Add 5 and check for h larger than p. */ + "ADDS r7, r2, #0x5\n\t" + "ADCS r7, r3, #0x0\n\t" + "ADCS r7, r4, #0x0\n\t" + "ADCS r7, r5, #0x0\n\t" + "ADC r7, r6, #0x0\n\t" + "SUB r7, r7, #0x4\n\t" + "LSR r7, r7, #31\n\t" + "SUB r7, r7, #0x1\n\t" + "AND r7, r7, #0x5\n\t" + /* Add 0/5 to h.
*/ + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* Add padding */ + "ADD r11, %[ctx], #0x24\n\t" + "LDM r11, {r7, r8, r9, r10}\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r5, r5, r10\n\t" + /* Store MAC */ + "STR r2, [%[mac]]\n\t" + "STR r3, [%[mac], #4]\n\t" + "STR r4, [%[mac], #8]\n\t" + "STR r5, [%[mac], #12]\n\t" + /* Zero out h. */ + "EOR r2, r2, r2\n\t" + "EOR r3, r3, r3\n\t" + "EOR r4, r4, r4\n\t" + "EOR r5, r5, r5\n\t" + "EOR r6, r6, r6\n\t" + "ADD r11, %[ctx], #0x10\n\t" + "STM r11, {r2, r3, r4, r5, r6}\n\t" + /* Zero out r. */ + "ADD r11, %[ctx], #0x0\n\t" + "STM r11, {r2, r3, r4, r5}\n\t" + /* Zero out padding. */ + "ADD r11, %[ctx], #0x24\n\t" + "STM r11, {r2, r3, r4, r5}\n\t" + : [ctx] "+r" (ctx), [mac] "+r" (mac) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" + ); +} + +#endif /* HAVE_POLY1305 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305.c b/wolfcrypt/src/port/arm/thumb2-poly1305.c new file mode 100644 index 0000000000..0091a3283b --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-poly1305.c @@ -0,0 +1,142 @@ +/* thumb2-poly1305.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> +#include <wolfssl/wolfcrypt/types.h> + +#ifdef WOLFSSL_ARMASM +#ifdef __thumb__ + +#ifdef HAVE_POLY1305 +#include <wolfssl/wolfcrypt/poly1305.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/cpuid.h> +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif +#ifdef CHACHA_AEAD_TEST + #include <stdio.h> +#endif + +/* Process 16 bytes of message at a time. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + * @param [in] bytes Length of message in bytes. + */ +void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char* m, + size_t bytes) +{ + poly1305_blocks_thumb2_16(ctx, m, bytes, 1); +} + +/* Process 16 bytes of message. + * + * @param [in] ctx Poly1305 context. + * @param [in] m Message to process. + */ +void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m) +{ + poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, 1); +} + +/* Set the key for the Poly1305 operation. + * + * @param [in, out] ctx Poly1305 context. + * @param [in] key Key data to use. + * @param [in] keySz Size of key in bytes. Must be 32. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or key is NULL or keySz is not 32. + */ +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + int ret = 0; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + if (key != NULL) { + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + } + printf("\n"); +#endif + + /* Validate parameters. */ + if ((ctx == NULL) || (key == NULL) || (keySz != 32)) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + poly1305_set_key(ctx, key); + } + + return ret; +} + +/* Finalize the Poly1305 operation calculating the MAC. + * + * @param [in, out] ctx Poly1305 context.
+ * @param [out] mac Buffer to hold the MAC. Must be at least 16 bytes long. + * @return 0 on success. + * @return BAD_FUNC_ARG when ctx or mac is NULL. + */ +int wc_Poly1305Final(Poly1305* ctx, byte* mac) +{ + int ret = 0; + + /* Validate parameters. */ + if ((ctx == NULL) || (mac == NULL)) { + ret = BAD_FUNC_ARG; + } + + /* Process the remaining partial block - last block. */ + if (ret == 0) { + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; i++) { + ctx->buffer[i] = 0; + } + poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, + 0); + } + + poly1305_final(ctx, mac); + } + + return ret; +} + +#endif /* HAVE_POLY1305 */ +#endif /* __thumb__ */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 4103ce083c..535377393f 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -7857,8 +7857,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) byte tag[16]; Poly1305 enc; - WOLFSSL_SMALL_STACK_STATIC const byte msg1[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg1[] = { 0x43,0x72,0x79,0x70,0x74,0x6f,0x67,0x72, 0x61,0x70,0x68,0x69,0x63,0x20,0x46,0x6f, 0x72,0x75,0x6d,0x20,0x52,0x65,0x73,0x65, @@ -7866,22 +7865,19 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) 0x75,0x70 }; - WOLFSSL_SMALL_STACK_STATIC const byte msg2[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg2[] = { 0x48,0x65,0x6c,0x6c,0x6f,0x20,0x77,0x6f,0x72, 0x6c,0x64,0x21 }; - WOLFSSL_SMALL_STACK_STATIC const byte msg3[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg3[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 }; - WOLFSSL_SMALL_STACK_STATIC const byte msg4[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg4[] = { 0xd3,0x1a,0x8d,0x34,0x64,0x8e,0x60,0xdb, 0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2,
0xa4,0xad,0xed,0x51,0x29,0x6e,0x08,0xfe, @@ -7899,14 +7895,12 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) 0x61,0x16 }; - WOLFSSL_SMALL_STACK_STATIC const byte msg5[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg5[] = { 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, }; - WOLFSSL_SMALL_STACK_STATIC const byte msg6[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte msg6[] = { 0xd3,0x1a,0x8d,0x34,0x64,0x8e,0x60,0xdb, 0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2, 0xa4,0xad,0xed,0x51,0x29,0x6e,0x08,0xfe, @@ -7928,54 +7922,57 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) 0xe5,0x76,0xd2,0x65,0x86,0xce,0xc6,0x4b, 0x61,0x16 }; + WOLFSSL_SMALL_STACK_STATIC const byte msg7[] = { + 0xe8,0x8c,0x85,0x03,0x43,0xaf,0xa7,0x85, + 0x21,0x6b,0xc3,0x45,0xc4,0x53,0x98,0xf8, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + }; - byte additional[] = - { + byte additional[] = { 0x50,0x51,0x52,0x53,0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7 }; - WOLFSSL_SMALL_STACK_STATIC const byte correct0[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct0[] = { 0x01,0x03,0x80,0x8a,0xfb,0x0d,0xb2,0xfd, 0x4a,0xbf,0xf6,0xaf,0x41,0x49,0xf5,0x1b }; - WOLFSSL_SMALL_STACK_STATIC const byte correct1[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct1[] = { 0xa8,0x06,0x1d,0xc1,0x30,0x51,0x36,0xc6, 0xc2,0x2b,0x8b,0xaf,0x0c,0x01,0x27,0xa9 }; - WOLFSSL_SMALL_STACK_STATIC const byte correct2[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct2[] = { 0xa6,0xf7,0x45,0x00,0x8f,0x81,0xc9,0x16, 0xa2,0x0d,0xcc,0x74,0xee,0xf2,0xb2,0xf0 }; - WOLFSSL_SMALL_STACK_STATIC const byte correct3[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct3[] = { 0x49,0xec,0x78,0x09,0x0e,0x48,0x1e,0xc6, 0xc2,0x6b,0x33,0xb9,0x1c,0xcc,0x03,0x07 }; - WOLFSSL_SMALL_STACK_STATIC const byte correct4[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct4[] = { 0x1a,0xe1,0x0b,0x59,0x4f,0x09,0xe2,0x6a, 
0x7e,0x90,0x2e,0xcb,0xd0,0x60,0x06,0x91 }; - WOLFSSL_SMALL_STACK_STATIC const byte correct5[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct5[] = { 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, }; - WOLFSSL_SMALL_STACK_STATIC const byte correct6[] = - { + WOLFSSL_SMALL_STACK_STATIC const byte correct6[] = { 0xea,0x11,0x5c,0x4f,0xd0,0xc0,0x10,0xae, 0xf7,0xdf,0xda,0x77,0xa2,0xe9,0xaf,0xca }; + WOLFSSL_SMALL_STACK_STATIC const byte correct7[] = { + 0x14,0x00,0x00,0x88,0x5c,0x00,0x00,0x88, + 0x5c,0x00,0x00,0x88,0x5c,0x00,0x00,0x88 + }; + WOLFSSL_SMALL_STACK_STATIC const byte key[] = { 0x85,0xd6,0xbe,0x78,0x57,0x55,0x6d,0x33, @@ -8005,17 +8002,25 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 }; - const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5, msg6}; + WOLFSSL_SMALL_STACK_STATIC const byte key7[] = { + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + }; + + const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5, msg6, msg7}; word32 szm[] = {0, sizeof(msg1), sizeof(msg2), - sizeof(msg3), sizeof(msg5), sizeof(msg6)}; - const byte* keys[] = {key, key, key2, key2, key5, key}; + sizeof(msg3), sizeof(msg5), sizeof(msg6), + sizeof(msg7)}; + const byte* keys[] = {key, key, key2, key2, key5, key, key7}; const byte* tests[] = {correct0, correct1, correct2, correct3, correct5, - correct6}; + correct6, correct7}; int i; wc_test_ret_t ret = 0; WOLFSSL_ENTER("poly1305_test"); - for (i = 0; i < 6; i++) { + for (i = 0; i < 7; i++) { ret = wc_Poly1305SetKey(&enc, keys[i], 32); if (ret != 0) return WC_TEST_RET_ENC_I(i); diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h index 987dc9fb14..42e71aee57 100644 --- a/wolfssl/wolfcrypt/chacha.h +++ b/wolfssl/wolfcrypt/chacha.h @@ -107,6 +107,16 @@ WOLFSSL_API int wc_XChacha_SetKey(ChaCha *ctx, const byte *key, 
word32 keySz, word32 counter); #endif +#if defined(WOLFSSL_ARMASM) && defined(__thumb__) +void wc_chacha_setiv(word32* x, const byte* iv, word32 counter); +void wc_chacha_setkey(word32* x, const byte* key, word32 keySz); +void wc_chacha_use_over(byte* over, byte* output, const byte* input, + word32 len); +void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len); + +#endif + + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index a765a8775d..bcc48a6298 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -98,6 +98,12 @@ typedef struct Poly1305 { word64 leftover; unsigned char buffer[POLY1305_BLOCK_SIZE]; unsigned char finished; +#elif defined(WOLFSSL_ARMASM) && defined(__thumb__) + word32 r[4]; + word32 h[5]; + word32 pad[4]; + word32 leftover; + unsigned char buffer[POLY1305_BLOCK_SIZE]; #elif defined(WOLFSSL_RISCV_ASM) word64 r[2]; #ifdef WOLFSSL_RISCV_VECTOR @@ -146,16 +152,30 @@ WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, #define poly1305_block poly1305_block_aarch64 void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, - size_t bytes); + size_t bytes); void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m); #endif +#if defined(__thumb__ ) && defined(WOLFSSL_ARMASM) +#define poly1305_blocks poly1305_blocks_thumb2 +#define poly1305_block poly1305_block_thumb2 + +void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char *m, + size_t bytes); +void poly1305_block_thumb2(Poly1305* ctx, const unsigned char *m); + +void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m, + word32 len, int notLast); +void poly1305_set_key(Poly1305* ctx, const byte* key); +void poly1305_final(Poly1305* ctx, byte* mac); +#endif + #if defined(WOLFSSL_RISCV_ASM) #define poly1305_blocks poly1305_blocks_riscv64 #define poly1305_block poly1305_block_riscv64 void poly1305_blocks_riscv64(Poly1305* ctx, const 
unsigned char *m, - size_t bytes); + size_t bytes); void poly1305_block_riscv64(Poly1305* ctx, const unsigned char *m); #endif