These wire Andy Polyakov's implementations up to the kernel. We make a
few small changes to the assembly:

- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
  removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The NEON code can jump to the scalar code when it makes sense to do
  so.

The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
and then having to go back to scalar -- because the user is silly and has
called the update function from two separate contexts -- then we need to
convert back to the original base before proceeding. It is possible to
reason that the initial reduction below is sufficient given the
implementation invariants. However, for the avoidance of doubt and because
this is not performance critical, we do the full reduction anyway. This
conversion is found in the glue code, and a proof of correctness may be
easily obtained from Z3: <https://xn--4db.cc/ltPtHCKN/py>.

Signed-off-by: Jason A. Donenfeld <ja...@zx2c4.com>
Cc: Russell King <li...@armlinux.org.uk>
Cc: linux-arm-ker...@lists.infradead.org
Cc: Samuel Neves <sne...@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumas...@gmail.com>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: kernel-harden...@lists.openwall.com
Cc: linux-crypto@vger.kernel.org
---
 lib/zinc/Makefile                             |   2 +
 lib/zinc/poly1305/poly1305-arm-glue.c         | 140 +++++++++++++++++
 ...ly1305-arm-cryptogams.S => poly1305-arm.S} | 147 ++++++------------
 ...05-arm64-cryptogams.S => poly1305-arm64.S} | 127 +++++----------
 lib/zinc/poly1305/poly1305.c                  |   2 +
 5 files changed, 231 insertions(+), 187 deletions(-)
 create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.c
 rename lib/zinc/poly1305/{poly1305-arm-cryptogams.S => poly1305-arm.S} (91%)
 rename lib/zinc/poly1305/{poly1305-arm64-cryptogams.S => poly1305-arm64.S} 
(89%)

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index a8943d960b6a..c09fd3de60f9 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -12,4 +12,6 @@ obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
 
 zinc_poly1305-y := poly1305/poly1305.o
 zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM) += poly1305/poly1305-arm.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM64) += poly1305/poly1305-arm64.o
 obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
diff --git a/lib/zinc/poly1305/poly1305-arm-glue.c 
b/lib/zinc/poly1305/poly1305-arm-glue.c
new file mode 100644
index 000000000000..f4f08ecffbf6
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-glue.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <ja...@zx2c4.com>. All Rights 
Reserved.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
+                                   const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t 
len,
+                                    const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
+
+static bool poly1305_use_neon __ro_after_init;
+static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon };
+
+static void __init poly1305_fpu_init(void)
+{
+#if defined(CONFIG_ZINC_ARCH_ARM64)
+       poly1305_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ZINC_ARCH_ARM)
+       poly1305_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+#if defined(CONFIG_ZINC_ARCH_ARM64)
+struct poly1305_arch_internal {
+       union {
+               u32 h[5];
+               struct {
+                       u64 h0, h1, h2;
+               };
+       };
+       u64 is_base2_26;
+       u64 r[2];
+};
+#elif defined(CONFIG_ZINC_ARCH_ARM)
+struct poly1305_arch_internal {
+       union {
+               u32 h[5];
+               struct {
+                       u64 h0, h1;
+                       u32 h2;
+               } __packed;
+       };
+       u32 r[4];
+       u32 is_base2_26;
+};
+#endif
+
+/* The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
+ * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
+ * and then having to go back to scalar -- because the user is silly and has
+ * called the update function from two separate contexts -- then we need to
+ * convert back to the original base before proceeding. The below function is
+ * written for 64-bit integers, and so we have to swap words at the end on
+ * big-endian 32-bit. It is possible to reason that the initial reduction below
+ * is sufficient given the implementation invariants. However, for an avoidance
+ * of doubt and because this is not performance critical, we do the full
+ * reduction anyway.
+ */
+static void convert_to_base2_64(void *ctx)
+{
+       struct poly1305_arch_internal *state = ctx;
+       u32 cy;
+
+       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26)
+               return;
+
+       cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+       cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+       cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+       cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+       state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | 
state->h[0];
+       state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | 
(state->h[2] >> 12);
+       state->h2 = state->h[4] >> 24;
+       if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && 
IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) {
+               state->h0 = rol64(state->h0, 32);
+               state->h1 = rol64(state->h1, 32);
+       }
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+       cy = (state->h2 >> 2) + (state->h2 & ~3ULL);
+       state->h2 &= 3;
+       state->h0 += cy;
+       state->h1 += (cy = ULT(state->h0, cy));
+       state->h2 += ULT(state->h1, cy);
+#undef ULT
+       state->is_base2_26 = 0;
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+                                     const u8 key[POLY1305_KEY_SIZE])
+{
+       poly1305_init_arm(ctx, key);
+       return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+                                       size_t len, const u32 padbit,
+                                       simd_context_t *simd_context)
+{
+       /* SIMD disables preemption, so relax after processing each page. */
+       BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+                    PAGE_SIZE % POLY1305_BLOCK_SIZE);
+
+       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon ||
+           !simd_use(simd_context)) {
+               convert_to_base2_64(ctx);
+               poly1305_blocks_arm(ctx, inp, len, padbit);
+               return true;
+       }
+
+       for (;;) {
+               const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+
+               poly1305_blocks_neon(ctx, inp, bytes, padbit);
+               len -= bytes;
+               if (!len)
+                       break;
+               inp += bytes;
+               simd_relax(simd_context);
+       }
+       return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+                                     const u32 nonce[4],
+                                     simd_context_t *simd_context)
+{
+       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !poly1305_use_neon ||
+           !simd_use(simd_context)) {
+               convert_to_base2_64(ctx);
+               poly1305_emit_arm(ctx, mac, nonce);
+       } else
+               poly1305_emit_neon(ctx, mac, nonce);
+       return true;
+}
diff --git a/lib/zinc/poly1305/poly1305-arm-cryptogams.S 
b/lib/zinc/poly1305/poly1305-arm.S
similarity index 91%
rename from lib/zinc/poly1305/poly1305-arm-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm.S
index 884b465030e4..4a0e9d451119 100644
--- a/lib/zinc/poly1305/poly1305-arm-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm.S
@@ -1,9 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <ja...@zx2c4.com>. All Rights 
Reserved.
  * Copyright (C) 2006-2017 CRYPTOGAMS by <ap...@openssl.org>. All Rights 
Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
  */
 
-#include "arm_arch.h"
+#include <linux/linkage.h>
 
 .text
 #if defined(__thumb2__)
@@ -13,13 +16,8 @@
 .code  32
 #endif
 
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type  poly1305_init,%function
 .align 5
-poly1305_init:
-.Lpoly1305_init:
+ENTRY(poly1305_init_arm)
        stmdb   sp!,{r4-r11}
 
        eor     r3,r3,r3
@@ -38,10 +36,6 @@ poly1305_init:
        moveq   r0,#0
        beq     .Lno_key
 
-#if    __ARM_MAX_ARCH__>=7
-       adr     r11,.Lpoly1305_init
-       ldr     r12,.LOPENSSL_armcap
-#endif
        ldrb    r4,[r1,#0]
        mov     r10,#0x0fffffff
        ldrb    r5,[r1,#1]
@@ -56,12 +50,6 @@ poly1305_init:
        ldrb    r7,[r1,#6]
        and     r4,r4,r10
 
-#if    __ARM_MAX_ARCH__>=7
-       ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
-# ifdef        __APPLE__
-       ldr     r12,[r12]
-# endif
-#endif
        ldrb    r8,[r1,#7]
        orr     r5,r5,r6,lsl#8
        ldrb    r6,[r1,#8]
@@ -71,35 +59,6 @@ poly1305_init:
        ldrb    r8,[r1,#10]
        and     r5,r5,r3
 
-#if    __ARM_MAX_ARCH__>=7
-       tst     r12,#ARMV7_NEON         @ check for NEON
-# ifdef        __APPLE__
-       adr     r9,poly1305_blocks_neon
-       adr     r11,poly1305_blocks
-#  ifdef __thumb2__
-       it      ne
-#  endif
-       movne   r11,r9
-       adr     r12,poly1305_emit
-       adr     r10,poly1305_emit_neon
-#  ifdef __thumb2__
-       it      ne
-#  endif
-       movne   r12,r10
-# else
-#  ifdef __thumb2__
-       itete   eq
-#  endif
-       addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
-       addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
-       addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
-       addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef        __thumb2__
-       orr     r12,r12,#1      @ thumb-ify address
-       orr     r11,r11,#1
-# endif
-#endif
        ldrb    r9,[r1,#11]
        orr     r6,r6,r7,lsl#8
        ldrb    r7,[r1,#12]
@@ -118,26 +77,20 @@ poly1305_init:
        str     r6,[r0,#8]
        and     r7,r7,r3
        str     r7,[r0,#12]
-#if    __ARM_MAX_ARCH__>=7
-       stmia   r2,{r11,r12}            @ fill functions table
-       mov     r0,#1
-#else
-       mov     r0,#0
-#endif
 .Lno_key:
        ldmia   sp!,{r4-r11}
-#if    __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
        bx      lr                              @ bx    lr
 #else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb 
ISA:-)
 #endif
-.size  poly1305_init,.-poly1305_init
-.type  poly1305_blocks,%function
+ENDPROC(poly1305_init_arm)
+
 .align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
        stmdb   sp!,{r3-r11,lr}
 
        ands    r2,r2,#-16
@@ -158,11 +111,11 @@ poly1305_blocks:
        b       .Loop
 
 .Loop:
-#if __ARM_ARCH__<7
+#if __LINUX_ARM_ARCH__ < 7
        ldrb    r0,[lr],#16             @ load input
-# ifdef        __thumb2__
+#ifdef __thumb2__
        it      hi
-# endif
+#endif
        addhi   r8,r8,#1                @ 1<<128
        ldrb    r1,[lr,#-15]
        ldrb    r2,[lr,#-14]
@@ -201,19 +154,19 @@ poly1305_blocks:
        orr     r3,r2,r3,lsl#24
 #else
        ldr     r0,[lr],#16             @ load input
-# ifdef        __thumb2__
+#ifdef __thumb2__
        it      hi
-# endif
+#endif
        addhi   r8,r8,#1                @ padbit
        ldr     r1,[lr,#-12]
        ldr     r2,[lr,#-8]
        ldr     r3,[lr,#-4]
-# ifdef        __ARMEB__
+#ifdef __ARMEB__
        rev     r0,r0
        rev     r1,r1
        rev     r2,r2
        rev     r3,r3
-# endif
+#endif
        adds    r4,r4,r0                @ accumulate input
        str     lr,[sp,#8]              @ offload input pointer
        adcs    r5,r5,r1
@@ -283,7 +236,7 @@ poly1305_blocks:
        stmia   r0,{r4-r8}              @ store the result
 
 .Lno_data:
-#if    __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
        ldmia   sp!,{r3-r11,pc}
 #else
        ldmia   sp!,{r3-r11,lr}
@@ -291,13 +244,12 @@ poly1305_blocks:
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb 
ISA:-)
 #endif
-.size  poly1305_blocks,.-poly1305_blocks
-.type  poly1305_emit,%function
+ENDPROC(poly1305_blocks_arm)
+
 .align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
        stmdb   sp!,{r4-r11}
 .Lpoly1305_emit_enter:
-
        ldmia   r0,{r3-r7}
        adds    r8,r3,#5                @ compare to modulus
        adcs    r9,r4,#0
@@ -332,13 +284,13 @@ poly1305_emit:
        adcs    r5,r5,r10
        adc     r6,r6,r11
 
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
        rev     r3,r3
        rev     r4,r4
        rev     r5,r5
        rev     r6,r6
-# endif
+#endif
        str     r3,[r1,#0]
        str     r4,[r1,#4]
        str     r5,[r1,#8]
@@ -377,20 +329,22 @@ poly1305_emit:
        strb    r6,[r1,#15]
 #endif
        ldmia   sp!,{r4-r11}
-#if    __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
        bx      lr                              @ bx    lr
 #else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb 
ISA:-)
 #endif
-.size  poly1305_emit,.-poly1305_emit
-#if    __ARM_MAX_ARCH__>=7
+ENDPROC(poly1305_emit_arm)
+
+
+#ifdef CONFIG_KERNEL_MODE_NEON
 .fpu   neon
 
-.type  poly1305_init_neon,%function
 .align 5
-poly1305_init_neon:
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
        ldr     r4,[r0,#20]             @ load key base 2^32
        ldr     r5,[r0,#24]
        ldr     r6,[r0,#28]
@@ -600,11 +554,10 @@ poly1305_init_neon:
        vst1.32         {d8[1]},[r7]
 
        bx      lr                              @ bx    lr
-.size  poly1305_init_neon,.-poly1305_init_neon
+ENDPROC(poly1305_init_neon)
 
-.type  poly1305_blocks_neon,%function
 .align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
        ldr     ip,[r0,#36]             @ is_base2_26
        ands    r2,r2,#-16
        beq     .Lno_data_neon
@@ -612,7 +565,7 @@ poly1305_blocks_neon:
        cmp     r2,#64
        bhs     .Lenter_neon
        tst     ip,ip                   @ is_base2_26?
-       beq     .Lpoly1305_blocks
+       beq     .Lpoly1305_blocks_arm
 
 .Lenter_neon:
        stmdb   sp!,{r4-r7}
@@ -622,7 +575,7 @@ poly1305_blocks_neon:
        bne     .Lbase2_26_neon
 
        stmdb   sp!,{r1-r3,lr}
-       bl      poly1305_init_neon
+       bl      .Lpoly1305_init_neon
 
        ldr     r4,[r0,#0]              @ load hash value base 2^32
        ldr     r5,[r0,#4]
@@ -686,12 +639,12 @@ poly1305_blocks_neon:
        sub             r2,r2,#16
        add             r4,r1,#32
 
-# ifdef        __ARMEB__
+#ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
-# endif
+#endif
        vsri.u32        d28,d26,#8      @ base 2^32 -> base 2^26
        vshl.u32        d26,d26,#18
 
@@ -735,12 +688,12 @@ poly1305_blocks_neon:
        addhi           r7,r0,#(48+1*9*4)
        addhi           r6,r0,#(48+3*9*4)
 
-# ifdef        __ARMEB__
+#ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
-# endif
+#endif
        vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
        vshl.u32        q13,q13,#18
 
@@ -866,12 +819,12 @@ poly1305_blocks_neon:
 
        vld4.32         {d20,d22,d24,d26},[r1]  @ inp[0:1]
        add             r1,r1,#64
-# ifdef        __ARMEB__
+#ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q11,q11
        vrev32.8        q12,q12
        vrev32.8        q13,q13
-# endif
+#endif
 
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction interleaved with base 2^32 -> base 2^26 of
@@ -1086,11 +1039,10 @@ poly1305_blocks_neon:
        ldmia   sp!,{r4-r7}
 .Lno_data_neon:
        bx      lr                                      @ bx    lr
-.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)
 
-.type  poly1305_emit_neon,%function
 .align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
        ldr     ip,[r0,#36]             @ is_base2_26
 
        stmdb   sp!,{r4-r11}
@@ -1144,12 +1096,12 @@ poly1305_emit_neon:
        adcs    r5,r5,r10
        adc     r6,r6,r11
 
-# ifdef __ARMEB__
+#ifdef __ARMEB__
        rev     r3,r3
        rev     r4,r4
        rev     r5,r5
        rev     r6,r6
-# endif
+#endif
        str     r3,[r1,#0]              @ store the result
        str     r4,[r1,#4]
        str     r5,[r1,#8]
@@ -1157,16 +1109,9 @@ poly1305_emit_neon:
 
        ldmia   sp!,{r4-r11}
        bx      lr                              @ bx    lr
-.size  poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)
 
 .align 5
 .Lzeros:
 .long  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-.Lpoly1305_init
-#endif
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <ap...@openssl.org>"
-.align 2
-#if    __ARM_MAX_ARCH__>=7
-.comm   OPENSSL_armcap_P,4,4
 #endif
diff --git a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S 
b/lib/zinc/poly1305/poly1305-arm64.S
similarity index 89%
rename from lib/zinc/poly1305/poly1305-arm64-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm64.S
index 0ecb50a83ec0..5f4e7fb0a836 100644
--- a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm64.S
@@ -1,21 +1,16 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <ja...@zx2c4.com>. All Rights 
Reserved.
  * Copyright (C) 2006-2017 CRYPTOGAMS by <ap...@openssl.org>. All Rights 
Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
  */
 
-#include "arm_arch.h"
-
+#include <linux/linkage.h>
 .text
 
-// forward "declarations" are required for Apple
-
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type  poly1305_init,%function
 .align 5
-poly1305_init:
+ENTRY(poly1305_init_arm)
        cmp     x1,xzr
        stp     xzr,xzr,[x0]            // zero hash value
        stp     xzr,xzr,[x0,#16]        // [along with is_base2_26]
@@ -23,18 +18,10 @@ poly1305_init:
        csel    x0,xzr,x0,eq
        b.eq    .Lno_key
 
-#ifdef __ILP32__
-       ldrsw   x11,.LOPENSSL_armcap_P
-#else
-       ldr     x11,.LOPENSSL_armcap_P
-#endif
-       adr     x10,.LOPENSSL_armcap_P
-
        ldp     x7,x8,[x1]              // load key
        mov     x9,#0xfffffffc0fffffff
        movk    x9,#0x0fff,lsl#48
-       ldr     w17,[x10,x11]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x7,x7                   // flip bytes
        rev     x8,x8
 #endif
@@ -43,30 +30,12 @@ poly1305_init:
        and     x8,x8,x9                // &=0ffffffc0ffffffc
        stp     x7,x8,[x0,#32]  // save key value
 
-       tst     w17,#ARMV7_NEON
-
-       adr     x12,poly1305_blocks
-       adr     x7,poly1305_blocks_neon
-       adr     x13,poly1305_emit
-       adr     x8,poly1305_emit_neon
-
-       csel    x12,x12,x7,eq
-       csel    x13,x13,x8,eq
-
-#ifdef __ILP32__
-       stp     w12,w13,[x2]
-#else
-       stp     x12,x13,[x2]
-#endif
-
-       mov     x0,#1
 .Lno_key:
        ret
-.size  poly1305_init,.-poly1305_init
+ENDPROC(poly1305_init_arm)
 
-.type  poly1305_blocks,%function
 .align 5
-poly1305_blocks:
+ENTRY(poly1305_blocks_arm)
        ands    x2,x2,#-16
        b.eq    .Lno_data
 
@@ -80,7 +49,7 @@ poly1305_blocks:
 .Loop:
        ldp     x10,x11,[x1],#16        // load input
        sub     x2,x2,#16
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x10,x10
        rev     x11,x11
 #endif
@@ -126,11 +95,10 @@ poly1305_blocks:
 
 .Lno_data:
        ret
-.size  poly1305_blocks,.-poly1305_blocks
+ENDPROC(poly1305_blocks_arm)
 
-.type  poly1305_emit,%function
 .align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
        ldp     x4,x5,[x0]              // load hash base 2^64
        ldr     x6,[x0,#16]
        ldp     x10,x11,[x2]    // load nonce
@@ -144,23 +112,23 @@ poly1305_emit:
        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq
 
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
 #endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
 #endif
        stp     x4,x5,[x1]              // write result
 
        ret
-.size  poly1305_emit,.-poly1305_emit
-.type  poly1305_mult,%function
+ENDPROC(poly1305_emit_arm)
+
 .align 5
-poly1305_mult:
+__poly1305_mult:
        mul     x12,x4,x7               // h0*r0
        umulh   x13,x4,x7
 
@@ -193,11 +161,8 @@ poly1305_mult:
        adc     x6,x6,xzr
 
        ret
-.size  poly1305_mult,.-poly1305_mult
 
-.type  poly1305_splat,%function
-.align 5
-poly1305_splat:
+__poly1305_splat:
        and     x12,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x13,x4,#26,#26
        extr    x14,x5,x4,#52
@@ -220,15 +185,14 @@ poly1305_splat:
        str     w15,[x0,#16*8]  // s4
 
        ret
-.size  poly1305_splat,.-poly1305_splat
 
-.type  poly1305_blocks_neon,%function
+#ifdef CONFIG_KERNEL_MODE_NEON
 .align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
        ldr     x17,[x0,#24]
        cmp     x2,#128
        b.hs    .Lblocks_neon
-       cbz     x17,poly1305_blocks
+       cbz     x17,poly1305_blocks_arm
 
 .Lblocks_neon:
        stp     x29,x30,[sp,#-80]!
@@ -268,7 +232,7 @@ poly1305_blocks_neon:
        adcs    x5,x5,xzr
        adc     x6,x6,xzr
 
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x12,x12
        rev     x13,x13
 #endif
@@ -276,7 +240,7 @@ poly1305_blocks_neon:
        adcs    x5,x5,x13
        adc     x6,x6,x3
 
-       bl      poly1305_mult
+       bl      __poly1305_mult
        ldr     x30,[sp,#8]
 
        cbz     x3,.Lstore_base2_64_neon
@@ -314,7 +278,7 @@ poly1305_blocks_neon:
        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x12,x12
        rev     x13,x13
 #endif
@@ -322,7 +286,7 @@ poly1305_blocks_neon:
        adcs    x5,x5,x13
        adc     x6,x6,x3
 
-       bl      poly1305_mult
+       bl      __poly1305_mult
 
 .Linit_neon:
        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
@@ -349,19 +313,19 @@ poly1305_blocks_neon:
        mov     x5,x8
        mov     x6,xzr
        add     x0,x0,#48+12
-       bl      poly1305_splat
+       bl      __poly1305_splat
 
-       bl      poly1305_mult           // r^2
+       bl      __poly1305_mult         // r^2
        sub     x0,x0,#4
-       bl      poly1305_splat
+       bl      __poly1305_splat
 
-       bl      poly1305_mult           // r^3
+       bl      __poly1305_mult         // r^3
        sub     x0,x0,#4
-       bl      poly1305_splat
+       bl      __poly1305_splat
 
-       bl      poly1305_mult           // r^4
+       bl      __poly1305_mult         // r^4
        sub     x0,x0,#4
-       bl      poly1305_splat
+       bl      __poly1305_splat
        ldr     x30,[sp,#8]
 
        add     x16,x1,#32
@@ -399,7 +363,7 @@ poly1305_blocks_neon:
        lsl     x3,x3,#24
        add     x15,x0,#48
 
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
@@ -435,7 +399,7 @@ poly1305_blocks_neon:
        ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
        ld1     {v8.4s},[x15]
 
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
@@ -496,7 +460,7 @@ poly1305_blocks_neon:
        umull   v20.2d,v14.2s,v1.s[2]
        ldp     x9,x13,[x16],#48
        umull   v19.2d,v14.2s,v0.s[2]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
@@ -561,7 +525,7 @@ poly1305_blocks_neon:
        umlal   v23.2d,v11.2s,v3.s[0]
        umlal   v20.2d,v11.2s,v8.s[0]
        umlal   v21.2d,v11.2s,v0.s[0]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
@@ -801,13 +765,12 @@ poly1305_blocks_neon:
 .Lno_data_neon:
        ldr     x29,[sp],#80
        ret
-.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)
 
-.type  poly1305_emit_neon,%function
 .align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
        ldr     x17,[x0,#24]
-       cbz     x17,poly1305_emit
+       cbz     x17,poly1305_emit_arm
 
        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
@@ -840,30 +803,22 @@ poly1305_emit_neon:
        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq
 
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
 #endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
 #endif
        stp     x4,x5,[x1]              // write result
 
        ret
-.size  poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)
 
 .align 5
 .Lzeros:
 .long  0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long  OPENSSL_armcap_P-.
-#else
-.quad  OPENSSL_armcap_P-.
 #endif
-.byte  
80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
index 51af7045cac8..9dc85f62e806 100644
--- a/lib/zinc/poly1305/poly1305.c
+++ b/lib/zinc/poly1305/poly1305.c
@@ -18,6 +18,8 @@
 
 #if defined(CONFIG_ZINC_ARCH_X86_64)
 #include "poly1305-x86_64-glue.c"
+#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
+#include "poly1305-arm-glue.c"
 #else
 static inline bool poly1305_init_arch(void *ctx,
                                      const u8 key[POLY1305_KEY_SIZE])
-- 
2.19.0

Reply via email to