This provides AVX, AVX-2, and AVX-512F implementations for Poly1305.
The AVX-512F implementation is disabled on Skylake-X, where heavy use of
zmm registers causes unacceptable downclocking.
These come from Andy Polyakov's implementation, with the following
modifications from Samuel Neves:

  - Some cosmetic changes, such as renaming labels to the .Lname form and
    adjusting constants and other details to follow Linux conventions.

  - CPU feature checking is done in C by the glue code, so that has been
    removed from the assembly.

  - poly1305_blocks_avx512 jumped into the middle of poly1305_blocks_avx2
    to process the final blocks. To appease objtool, the relevant AVX2
    tail code was duplicated for the AVX-512 function.

  - The original uses %rbp as a scratch register. However, the kernel
    expects %rbp to hold a valid frame pointer at all times in order to
    do proper unwinding, so the code has to be altered to preserve it.
    The most straightforward way to accomplish this was to replace $d3,
    formerly %r10, with %rdi, and to replace %rbp with %r10. Because
    %rdi, a pointer to the context structure, does not change and is not
    needed during the block iteration itself (the __poly1305_block macro
    below), it is safe to repurpose it there, and the overhead of saving
    and restoring it around the macro should be minimal. A short sketch
    of the resulting register mapping follows this list.

  - The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
    We replace this with a plain "ret". "rep ret" was meant to help with
    AMD K8 chips, cf. http://repzret.org/p/repzret. It makes no sense to
    keep that kludge in code that won't even run on those ancient AMD
    chips.
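
  As a point of reference, here is a minimal sketch of how the register
  substitution plays out in the multiply step. The instructions are the
  ones that appear in the __poly1305_block macro below; the upstream
  assignments noted in the comments come from the description above, not
  from the original source:

        mulq    %r14            # %rdx:%rax = r1 * h0
        movq    %rax,%r9        # low half  -> d2 (unchanged)
        movq    %r11,%rax       # reload r0 for the next multiply
        movq    %rdx,%rdi       # high half -> d3; upstream kept d3 in %r10
                                # ...and wherever upstream used %rbp, this
                                # version uses %r10, so %rbp stays untouched
                                # as the frame pointer.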

While this is CRYPTOGAMS code, the originating code here happens to be
the same as OpenSSL's commit 4dfe4310c31c4483705991d9a798ce9be1ed1c68.

Cycle counts on a Core i7 6700HQ using the AVX-2 codepath:

size    old     new
----    ----    ----
0       70      68
16      92      90
32      134     104
48      172     120
64      218     136
80      254     158
96      298     174
112     342     192
128     388     212
144     428     228
160     466     246
176     510     264
192     550     282
208     594     302
224     628     316
240     676     334
256     716     354
272     764     374
288     802     352
304     420     366
320     428     360
336     484     378
352     426     384
368     478     400
384     488     394
400     542     408
416     486     416
432     534     430
448     544     422
464     600     438
480     540     448
496     594     464
512     602     456
528     656     476
544     600     480
560     650     494
576     664     490
592     714     508
608     656     514
624     708     532
640     716     524
656     770     536
672     716     548
688     770     562
704     774     552
720     826     568
736     768     574
752     822     592
768     830     584
784     884     602
800     828     610
816     884     628
832     888     618
848     942     632
864     884     644
880     936     660
896     948     652
912     1000    664
928     942     676
944     994     690
960     1002    680
976     1054    694
992     1002    706
1008    1052    720

Cycle counts on a Xeon Gold 5120 using the AVX-512 codepath:

size    old     new
----    ----    ----
0       74      70
16      96      92
32      136     106
48      184     124
64      218     138
80      260     160
96      300     176
112     342     194
128     384     212
144     420     226
160     464     248
176     504     264
192     544     282
208     582     300
224     624     318
240     662     338
256     708     358
272     748     372
288     788     358
304     422     370
320     432     364
336     486     380
352     434     390
368     480     408
384     490     398
400     542     412
416     492     426
432     538     436
448     546     432
464     600     448
480     548     456
496     594     476
512     606     470
528     656     480
544     606     498
560     652     512
576     662     508
592     716     522
608     664     538
624     710     552
640     720     516
656     772     526
672     722     544
688     768     556
704     778     556
720     832     568
736     780     584
752     826     600
768     836     560
784     888     572
800     838     588
816     884     604
832     894     598
848     946     612
864     896     628
880     942     644
896     952     608
912     1004    616
928     954     634
944     1000    646
960     1008    646
976     1062    658
992     1012    674
1008    1058    690

Signed-off-by: Jason A. Donenfeld <ja...@zx2c4.com>
Co-developed-by: Samuel Neves <sne...@dei.uc.pt>
Signed-off-by: Samuel Neves <sne...@dei.uc.pt>
Based-on-code-from: Andy Polyakov <ap...@openssl.org>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumas...@gmail.com>
Cc: Andy Polyakov <ap...@openssl.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: x...@kernel.org
---
 lib/zinc/Makefile                        |    1 +
 lib/zinc/poly1305/poly1305-x86_64-glue.h |  157 ++
 lib/zinc/poly1305/poly1305-x86_64.S      | 2792 ++++++++++++++++++++++
 lib/zinc/poly1305/poly1305.c             |    4 +-
 4 files changed, 2953 insertions(+), 1 deletion(-)
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64-glue.h
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 6fc9626c55fa..a8943d960b6a 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -11,4 +11,5 @@ AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
 obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
 
 zinc_poly1305-y := poly1305/poly1305.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
 obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
diff --git a/lib/zinc/poly1305/poly1305-x86_64-glue.h b/lib/zinc/poly1305/poly1305-x86_64-glue.h
new file mode 100644
index 000000000000..3ca1bdd02c6b
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-glue.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <ja...@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+asmlinkage void poly1305_init_x86_64(void *ctx,
+                                    const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+                                      const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+                                    const u32 nonce[4]);
+#ifdef CONFIG_AS_AVX
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+                                 const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+                                   const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+                                    const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+                                      const size_t len, const u32 padbit);
+#endif
+
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+
+static void __init poly1305_fpu_init(void)
+{
+       poly1305_use_avx =
+               boot_cpu_has(X86_FEATURE_AVX) &&
+               cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+       poly1305_use_avx2 =
+               boot_cpu_has(X86_FEATURE_AVX) &&
+               boot_cpu_has(X86_FEATURE_AVX2) &&
+               cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+       poly1305_use_avx512 =
+               boot_cpu_has(X86_FEATURE_AVX) &&
+               boot_cpu_has(X86_FEATURE_AVX2) &&
+               boot_cpu_has(X86_FEATURE_AVX512F) &&
+               cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+                                 XFEATURE_MASK_AVX512, NULL) &&
+               /* Skylake downclocks unacceptably much when using zmm. */
+               boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+                                     const u8 key[POLY1305_KEY_SIZE])
+{
+       poly1305_init_x86_64(ctx, key);
+       return true;
+}
+
+struct poly1305_arch_internal {
+       union {
+               struct {
+                       u32 h[5];
+                       u32 is_base2_26;
+               };
+               u64 hs[3];
+       };
+       u64 r[2];
+       u64 pad;
+       struct { u32 r2, r1, r4, r3; } rn[9];
+};
+
+static void convert_to_base2_64(void *ctx)
+{
+       struct poly1305_arch_internal *state = ctx;
+       u32 cy;
+
+       if (!state->is_base2_26)
+               return;
+
+       cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+       cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+       cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+       cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+       state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+       state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+       state->hs[2] = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+       cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+       state->hs[2] &= 3;
+       state->hs[0] += cy;
+       state->hs[1] += (cy = ULT(state->hs[0], cy));
+       state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+       state->is_base2_26 = 0;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+                                       const size_t len, const u32 padbit,
+                                       simd_context_t *simd_context)
+{
+       struct poly1305_arch_internal *state = ctx;
+
+       if (!poly1305_use_avx ||
+           (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+           !simd_use(simd_context))
+               goto scalar;
+
+#ifdef CONFIG_AS_AVX512
+       if (poly1305_use_avx512) {
+               poly1305_blocks_avx512(ctx, inp, len, padbit);
+               return true;
+       }
+#endif
+
+#ifdef CONFIG_AS_AVX2
+       if (poly1305_use_avx2) {
+               poly1305_blocks_avx2(ctx, inp, len, padbit);
+               return true;
+       }
+#endif
+
+#ifdef CONFIG_AS_AVX
+       if (poly1305_use_avx) {
+               poly1305_blocks_avx(ctx, inp, len, padbit);
+               return true;
+       }
+#endif
+
+scalar:
+       convert_to_base2_64(ctx);
+       poly1305_blocks_x86_64(ctx, inp, len, padbit);
+       return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+                                     const u32 nonce[4],
+                                     simd_context_t *simd_context)
+{
+       struct poly1305_arch_internal *state = ctx;
+
+       if (!poly1305_use_avx || !state->is_base2_26 || !simd_use(simd_context))
+               goto scalar;
+
+#ifdef CONFIG_AS_AVX
+       if (poly1305_use_avx || poly1305_use_avx2 || poly1305_use_avx512) {
+               poly1305_emit_avx(ctx, mac, nonce);
+               return true;
+       }
+#endif
+
+scalar:
+       convert_to_base2_64(ctx);
+       poly1305_emit_x86_64(ctx, mac, nonce);
+       return true;
+}
diff --git a/lib/zinc/poly1305/poly1305-x86_64.S b/lib/zinc/poly1305/poly1305-x86_64.S
new file mode 100644
index 000000000000..3c3f2b4d880b
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64.S
@@ -0,0 +1,2792 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2017 Samuel Neves <sne...@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <ja...@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <ap...@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst192.Lconst, "aM", @progbits, 192
+.align 64
+.Lconst:
+.long  0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.long  16777216,0,16777216,0,16777216,0,16777216,0
+.long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.long  2,2,2,3,2,0,2,1
+.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.text
+
+.align 32
+ENTRY(poly1305_init_x86_64)
+       xorq    %rax,%rax
+       movq    %rax,0(%rdi)
+       movq    %rax,8(%rdi)
+       movq    %rax,16(%rdi)
+
+       cmpq    $0,%rsi
+       je      .Lno_key
+
+       movq    $0x0ffffffc0fffffff,%rax
+       movq    $0x0ffffffc0ffffffc,%rcx
+       andq    0(%rsi),%rax
+       andq    8(%rsi),%rcx
+       movq    %rax,24(%rdi)
+       movq    %rcx,32(%rdi)
+       movl    $1,%eax
+.Lno_key:
+       ret
+ENDPROC(poly1305_init_x86_64)
+
+.align 32
+ENTRY(poly1305_blocks_x86_64)
+.Lblocks:
+       shrq    $4,%rdx
+       jz      .Lno_data
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lblocks_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%r10
+
+       movq    %r13,%r12
+       shrq    $2,%r13
+       movq    %r12,%rax
+       addq    %r12,%r13
+       jmp     .Loop
+
+.align 32
+.Loop:
+
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       mulq    %r14
+       movq    %rax,%r9
+       movq    %r11,%rax
+       movq    %rdx,%rdi
+
+       mulq    %r14
+       movq    %rax,%r14
+       movq    %r11,%rax
+       movq    %rdx,%r8
+
+       mulq    %rbx
+       addq    %rax,%r9
+       movq    %r13,%rax
+       adcq    %rdx,%rdi
+
+       mulq    %rbx
+       movq    %r10,%rbx
+       addq    %rax,%r14
+       adcq    %rdx,%r8
+
+       imulq   %r13,%rbx
+       addq    %rbx,%r9
+       movq    %r8,%rbx
+       adcq    $0,%rdi
+
+       imulq   %r11,%r10
+       addq    %r9,%rbx
+       movq    $-4,%rax
+       adcq    %r10,%rdi
+
+       andq    %rdi,%rax
+       movq    %rdi,%r10
+       shrq    $2,%rdi
+       andq    $3,%r10
+       addq    %rdi,%rax
+       addq    %rax,%r14
+       adcq    $0,%rbx
+       adcq    $0,%r10
+
+       movq    %r12,%rax
+       decq    %r15
+       jnz     .Loop
+
+       movq    0(%rsp),%rdi
+
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %r10,16(%rdi)
+
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+       ret
+ENDPROC(poly1305_blocks_x86_64)
+
+.align 32
+ENTRY(poly1305_emit_x86_64)
+.Lemit:
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movq    16(%rdi),%r10
+
+       movq    %r8,%rax
+       addq    $5,%r8
+       movq    %r9,%rcx
+       adcq    $0,%r9
+       adcq    $0,%r10
+       shrq    $2,%r10
+       cmovnzq %r8,%rax
+       cmovnzq %r9,%rcx
+
+       addq    0(%rdx),%rax
+       adcq    8(%rdx),%rcx
+       movq    %rax,0(%rsi)
+       movq    %rcx,8(%rsi)
+
+       ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block
+       mulq    %r14
+       movq    %rax,%r9
+       movq    %r11,%rax
+       movq    %rdx,%rdi
+
+       mulq    %r14
+       movq    %rax,%r14
+       movq    %r11,%rax
+       movq    %rdx,%r8
+
+       mulq    %rbx
+       addq    %rax,%r9
+       movq    %r13,%rax
+       adcq    %rdx,%rdi
+
+       mulq    %rbx
+       movq    %r10,%rbx
+       addq    %rax,%r14
+       adcq    %rdx,%r8
+
+       imulq   %r13,%rbx
+       addq    %rbx,%r9
+       movq    %r8,%rbx
+       adcq    $0,%rdi
+
+       imulq   %r11,%r10
+       addq    %r9,%rbx
+       movq    $-4,%rax
+       adcq    %r10,%rdi
+
+       andq    %rdi,%rax
+       movq    %rdi,%r10
+       shrq    $2,%rdi
+       andq    $3,%r10
+       addq    %rdi,%rax
+       addq    %rax,%r14
+       adcq    $0,%rbx
+       adcq    $0,%r10
+.endm
+
+.macro __poly1305_init_avx
+       movq    %r11,%r14
+       movq    %r12,%rbx
+       xorq    %r10,%r10
+
+       leaq    48+64(%rdi),%rdi
+
+       movq    %r12,%rax
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       movq    %r11,%r9
+       andl    %r11d,%edx
+       movl    %eax,-64(%rdi)
+       shrq    $26,%r8
+       movl    %edx,-60(%rdi)
+       shrq    $26,%r9
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%eax
+       andl    %r9d,%edx
+       movl    %eax,-48(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,-44(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,-32(%rdi)
+       shrq    $26,%r8
+       movl    %edx,-28(%rdi)
+       shrq    $26,%r9
+
+       movq    %rbx,%rax
+       movq    %r12,%rdx
+       shlq    $12,%rax
+       shlq    $12,%rdx
+       orq     %r8,%rax
+       orq     %r9,%rdx
+       andl    $0x3ffffff,%eax
+       andl    $0x3ffffff,%edx
+       movl    %eax,-16(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,-12(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,0(%rdi)
+       movq    %rbx,%r8
+       movl    %edx,4(%rdi)
+       movq    %r12,%r9
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       shrq    $14,%r9
+       andl    %r8d,%eax
+       andl    %r9d,%edx
+       movl    %eax,16(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,20(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,32(%rdi)
+       shrq    $26,%r8
+       movl    %edx,36(%rdi)
+       shrq    $26,%r9
+
+       movq    %r10,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,48(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r9d,52(%rdi)
+       leaq    (%r9,%r9,4),%r9
+       movl    %r8d,64(%rdi)
+       movl    %r9d,68(%rdi)
+
+       movq    %r12,%rax
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+
+       movl    $0x3ffffff,%eax
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       shrq    $26,%r8
+       movl    %eax,-52(%rdi)
+
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%edx
+       movl    %edx,-36(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,-20(%rdi)
+
+       movq    %rbx,%rax
+       shlq    $12,%rax
+       orq     %r8,%rax
+       andl    $0x3ffffff,%eax
+       movl    %eax,-4(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movq    %rbx,%r8
+       movl    %eax,12(%rdi)
+
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       andl    %r8d,%edx
+       movl    %edx,28(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,44(%rdi)
+
+       movq    %r10,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,60(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r8d,76(%rdi)
+
+       movq    %r12,%rax
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+
+       movl    $0x3ffffff,%eax
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       shrq    $26,%r8
+       movl    %eax,-56(%rdi)
+
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%edx
+       movl    %edx,-40(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,-24(%rdi)
+
+       movq    %rbx,%rax
+       shlq    $12,%rax
+       orq     %r8,%rax
+       andl    $0x3ffffff,%eax
+       movl    %eax,-8(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movq    %rbx,%r8
+       movl    %eax,8(%rdi)
+
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       andl    %r8d,%edx
+       movl    %edx,24(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,40(%rdi)
+
+       movq    %r10,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,56(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r8d,72(%rdi)
+
+       leaq    -48-64(%rdi),%rdi
+.endm
+
+#ifdef CONFIG_AS_AVX
+.align 32
+ENTRY(poly1305_blocks_avx)
+
+       movl    20(%rdi),%r8d
+       cmpq    $128,%rdx
+       jae     .Lblocks_avx
+       testl   %r8d,%r8d
+       jz      .Lblocks
+
+.Lblocks_avx:
+       andq    $-16,%rdx
+       jz      .Lno_data_avx
+
+       vzeroupper
+
+       testl   %r8d,%r8d
+       jz      .Lbase2_64_avx
+
+       testq   $31,%rdx
+       jz      .Leven_avx
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lblocks_avx_body:
+
+       movq    %rdx,%r15
+
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movl    16(%rdi),%r10d
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+
+       movl    %r8d,%r14d
+       andq    $-2147483648,%r8
+       movq    %r9,%r12
+       movl    %r9d,%ebx
+       andq    $-2147483648,%r9
+
+       shrq    $6,%r8
+       shlq    $52,%r12
+       addq    %r8,%r14
+       shrq    $12,%rbx
+       shrq    $18,%r9
+       addq    %r12,%r14
+       adcq    %r9,%rbx
+
+       movq    %r10,%r8
+       shlq    $40,%r8
+       shrq    $24,%r10
+       addq    %r8,%rbx
+       adcq    $0,%r10
+
+       movq    $-4,%r9
+       movq    %r10,%r8
+       andq    %r10,%r9
+       shrq    $2,%r8
+       andq    $3,%r10
+       addq    %r9,%r8
+       addq    %r8,%r14
+       adcq    $0,%rbx
+       adcq    $0,%r10
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+
+       testq   %rcx,%rcx
+       jz      .Lstore_base2_64_avx
+
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r11
+       movq    %rbx,%r12
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r11
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r11,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r12
+       andq    $0x3ffffff,%rbx
+       orq     %r12,%r10
+
+       subq    $16,%r15
+       jz      .Lstore_base2_26_avx
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       jmp     .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %r10,16(%rdi)
+       jmp     .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+       movl    %eax,0(%rdi)
+       movl    %edx,4(%rdi)
+       movl    %r14d,8(%rdi)
+       movl    %ebx,12(%rdi)
+       movl    %r10d,16(%rdi)
+.align 16
+.Ldone_avx:
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rsp
+
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+       ret
+
+.align 32
+.Lbase2_64_avx:
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lbase2_64_avx_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movl    16(%rdi),%r10d
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       testq   $31,%rdx
+       jz      .Linit_avx
+
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       subq    $16,%r15
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+
+.Linit_avx:
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r8
+       movq    %rbx,%r9
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r8
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r8,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r9
+       andq    $0x3ffffff,%rbx
+       orq     %r9,%r10
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       movl    $1,20(%rdi)
+
+       __poly1305_init_avx
+
+.Lproceed_avx:
+       movq    %r15,%rdx
+
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rax
+       leaq    48(%rsp),%rsp
+
+.Lbase2_64_avx_epilogue:
+       jmp     .Ldo_avx
+
+
+.align 32
+.Leven_avx:
+       vmovd   0(%rdi),%xmm0
+       vmovd   4(%rdi),%xmm1
+       vmovd   8(%rdi),%xmm2
+       vmovd   12(%rdi),%xmm3
+       vmovd   16(%rdi),%xmm4
+
+.Ldo_avx:
+       leaq    8(%rsp),%r10
+       andq    $-32,%rsp
+       subq    $8,%rsp
+       leaq    -88(%rsp),%r11
+       subq    $0x178,%rsp
+       subq    $64,%rdx
+       leaq    -32(%rsi),%rax
+       cmovcq  %rax,%rsi
+
+       vmovdqu 48(%rdi),%xmm14
+       leaq    112(%rdi),%rdi
+       leaq    .Lconst(%rip),%rcx
+
+       vmovdqu 32(%rsi),%xmm5
+       vmovdqu 48(%rsi),%xmm6
+       vmovdqa 64(%rcx),%xmm15
+
+       vpsrldq $6,%xmm5,%xmm7
+       vpsrldq $6,%xmm6,%xmm8
+       vpunpckhqdq     %xmm6,%xmm5,%xmm9
+       vpunpcklqdq     %xmm6,%xmm5,%xmm5
+       vpunpcklqdq     %xmm8,%xmm7,%xmm8
+
+       vpsrlq  $40,%xmm9,%xmm9
+       vpsrlq  $26,%xmm5,%xmm6
+       vpand   %xmm15,%xmm5,%xmm5
+       vpsrlq  $4,%xmm8,%xmm7
+       vpand   %xmm15,%xmm6,%xmm6
+       vpsrlq  $30,%xmm8,%xmm8
+       vpand   %xmm15,%xmm7,%xmm7
+       vpand   %xmm15,%xmm8,%xmm8
+       vpor    32(%rcx),%xmm9,%xmm9
+
+       jbe     .Lskip_loop_avx
+
+
+       vmovdqu -48(%rdi),%xmm11
+       vmovdqu -32(%rdi),%xmm12
+       vpshufd $0xEE,%xmm14,%xmm13
+       vpshufd $0x44,%xmm14,%xmm10
+       vmovdqa %xmm13,-144(%r11)
+       vmovdqa %xmm10,0(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm14
+       vmovdqu -16(%rdi),%xmm10
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm14,-128(%r11)
+       vmovdqa %xmm11,16(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm13
+       vmovdqu 0(%rdi),%xmm11
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm13,-112(%r11)
+       vmovdqa %xmm12,32(%rsp)
+       vpshufd $0xEE,%xmm10,%xmm14
+       vmovdqu 16(%rdi),%xmm12
+       vpshufd $0x44,%xmm10,%xmm10
+       vmovdqa %xmm14,-96(%r11)
+       vmovdqa %xmm10,48(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm13
+       vmovdqu 32(%rdi),%xmm10
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm13,-80(%r11)
+       vmovdqa %xmm11,64(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm14
+       vmovdqu 48(%rdi),%xmm11
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm14,-64(%r11)
+       vmovdqa %xmm12,80(%rsp)
+       vpshufd $0xEE,%xmm10,%xmm13
+       vmovdqu 64(%rdi),%xmm12
+       vpshufd $0x44,%xmm10,%xmm10
+       vmovdqa %xmm13,-48(%r11)
+       vmovdqa %xmm10,96(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm14
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm14,-32(%r11)
+       vmovdqa %xmm11,112(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm13
+       vmovdqa 0(%rsp),%xmm14
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm13,-16(%r11)
+       vmovdqa %xmm12,128(%rsp)
+
+       jmp     .Loop_avx
+
+.align 32
+.Loop_avx:
+
+       vpmuludq        %xmm5,%xmm14,%xmm10
+       vpmuludq        %xmm6,%xmm14,%xmm11
+       vmovdqa %xmm2,32(%r11)
+       vpmuludq        %xmm7,%xmm14,%xmm12
+       vmovdqa 16(%rsp),%xmm2
+       vpmuludq        %xmm8,%xmm14,%xmm13
+       vpmuludq        %xmm9,%xmm14,%xmm14
+
+       vmovdqa %xmm0,0(%r11)
+       vpmuludq        32(%rsp),%xmm9,%xmm0
+       vmovdqa %xmm1,16(%r11)
+       vpmuludq        %xmm8,%xmm2,%xmm1
+       vpaddq  %xmm0,%xmm10,%xmm10
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vmovdqa %xmm3,48(%r11)
+       vpmuludq        %xmm7,%xmm2,%xmm0
+       vpmuludq        %xmm6,%xmm2,%xmm1
+       vpaddq  %xmm0,%xmm13,%xmm13
+       vmovdqa 48(%rsp),%xmm3
+       vpaddq  %xmm1,%xmm12,%xmm12
+       vmovdqa %xmm4,64(%r11)
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpmuludq        %xmm7,%xmm3,%xmm0
+       vpaddq  %xmm2,%xmm11,%xmm11
+
+       vmovdqa 64(%rsp),%xmm4
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpmuludq        %xmm6,%xmm3,%xmm1
+       vpmuludq        %xmm5,%xmm3,%xmm3
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vmovdqa 80(%rsp),%xmm2
+       vpaddq  %xmm3,%xmm12,%xmm12
+       vpmuludq        %xmm9,%xmm4,%xmm0
+       vpmuludq        %xmm8,%xmm4,%xmm4
+       vpaddq  %xmm0,%xmm11,%xmm11
+       vmovdqa 96(%rsp),%xmm3
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vmovdqa 128(%rsp),%xmm4
+       vpmuludq        %xmm6,%xmm2,%xmm1
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vpaddq  %xmm2,%xmm13,%xmm13
+       vpmuludq        %xmm9,%xmm3,%xmm0
+       vpmuludq        %xmm8,%xmm3,%xmm1
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vmovdqu 0(%rsi),%xmm0
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpmuludq        %xmm7,%xmm3,%xmm3
+       vpmuludq        %xmm7,%xmm4,%xmm7
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       vmovdqu 16(%rsi),%xmm1
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vpmuludq        %xmm8,%xmm4,%xmm8
+       vpmuludq        %xmm9,%xmm4,%xmm9
+       vpsrldq $6,%xmm0,%xmm2
+       vpaddq  %xmm8,%xmm12,%xmm12
+       vpaddq  %xmm9,%xmm13,%xmm13
+       vpsrldq $6,%xmm1,%xmm3
+       vpmuludq        112(%rsp),%xmm5,%xmm9
+       vpmuludq        %xmm6,%xmm4,%xmm5
+       vpunpckhqdq     %xmm1,%xmm0,%xmm4
+       vpaddq  %xmm9,%xmm14,%xmm14
+       vmovdqa -144(%r11),%xmm9
+       vpaddq  %xmm5,%xmm10,%xmm10
+
+       vpunpcklqdq     %xmm1,%xmm0,%xmm0
+       vpunpcklqdq     %xmm3,%xmm2,%xmm3
+
+
+       vpsrldq $5,%xmm4,%xmm4
+       vpsrlq  $26,%xmm0,%xmm1
+       vpand   %xmm15,%xmm0,%xmm0
+       vpsrlq  $4,%xmm3,%xmm2
+       vpand   %xmm15,%xmm1,%xmm1
+       vpand   0(%rcx),%xmm4,%xmm4
+       vpsrlq  $30,%xmm3,%xmm3
+       vpand   %xmm15,%xmm2,%xmm2
+       vpand   %xmm15,%xmm3,%xmm3
+       vpor    32(%rcx),%xmm4,%xmm4
+
+       vpaddq  0(%r11),%xmm0,%xmm0
+       vpaddq  16(%r11),%xmm1,%xmm1
+       vpaddq  32(%r11),%xmm2,%xmm2
+       vpaddq  48(%r11),%xmm3,%xmm3
+       vpaddq  64(%r11),%xmm4,%xmm4
+
+       leaq    32(%rsi),%rax
+       leaq    64(%rsi),%rsi
+       subq    $64,%rdx
+       cmovcq  %rax,%rsi
+
+       vpmuludq        %xmm0,%xmm9,%xmm5
+       vpmuludq        %xmm1,%xmm9,%xmm6
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vmovdqa -128(%r11),%xmm7
+       vpmuludq        %xmm2,%xmm9,%xmm5
+       vpmuludq        %xmm3,%xmm9,%xmm6
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm4,%xmm9,%xmm9
+       vpmuludq        -112(%r11),%xmm4,%xmm5
+       vpaddq  %xmm9,%xmm14,%xmm14
+
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpmuludq        %xmm2,%xmm7,%xmm6
+       vpmuludq        %xmm3,%xmm7,%xmm5
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vmovdqa -96(%r11),%xmm8
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpmuludq        %xmm1,%xmm7,%xmm6
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm6,%xmm12,%xmm12
+       vpaddq  %xmm7,%xmm11,%xmm11
+
+       vmovdqa -80(%r11),%xmm9
+       vpmuludq        %xmm2,%xmm8,%xmm5
+       vpmuludq        %xmm1,%xmm8,%xmm6
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vmovdqa -64(%r11),%xmm7
+       vpmuludq        %xmm0,%xmm8,%xmm8
+       vpmuludq        %xmm4,%xmm9,%xmm5
+       vpaddq  %xmm8,%xmm12,%xmm12
+       vpaddq  %xmm5,%xmm11,%xmm11
+       vmovdqa -48(%r11),%xmm8
+       vpmuludq        %xmm3,%xmm9,%xmm9
+       vpmuludq        %xmm1,%xmm7,%xmm6
+       vpaddq  %xmm9,%xmm10,%xmm10
+
+       vmovdqa -16(%r11),%xmm9
+       vpaddq  %xmm6,%xmm14,%xmm14
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpmuludq        %xmm4,%xmm8,%xmm5
+       vpaddq  %xmm7,%xmm13,%xmm13
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vmovdqu 32(%rsi),%xmm5
+       vpmuludq        %xmm3,%xmm8,%xmm7
+       vpmuludq        %xmm2,%xmm8,%xmm8
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vmovdqu 48(%rsi),%xmm6
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+       vpmuludq        %xmm2,%xmm9,%xmm2
+       vpmuludq        %xmm3,%xmm9,%xmm3
+       vpsrldq $6,%xmm5,%xmm7
+       vpaddq  %xmm2,%xmm11,%xmm11
+       vpmuludq        %xmm4,%xmm9,%xmm4
+       vpsrldq $6,%xmm6,%xmm8
+       vpaddq  %xmm3,%xmm12,%xmm2
+       vpaddq  %xmm4,%xmm13,%xmm3
+       vpmuludq        -32(%r11),%xmm0,%xmm4
+       vpmuludq        %xmm1,%xmm9,%xmm0
+       vpunpckhqdq     %xmm6,%xmm5,%xmm9
+       vpaddq  %xmm4,%xmm14,%xmm4
+       vpaddq  %xmm0,%xmm10,%xmm0
+
+       vpunpcklqdq     %xmm6,%xmm5,%xmm5
+       vpunpcklqdq     %xmm8,%xmm7,%xmm8
+
+
+       vpsrldq $5,%xmm9,%xmm9
+       vpsrlq  $26,%xmm5,%xmm6
+       vmovdqa 0(%rsp),%xmm14
+       vpand   %xmm15,%xmm5,%xmm5
+       vpsrlq  $4,%xmm8,%xmm7
+       vpand   %xmm15,%xmm6,%xmm6
+       vpand   0(%rcx),%xmm9,%xmm9
+       vpsrlq  $30,%xmm8,%xmm8
+       vpand   %xmm15,%xmm7,%xmm7
+       vpand   %xmm15,%xmm8,%xmm8
+       vpor    32(%rcx),%xmm9,%xmm9
+
+       vpsrlq  $26,%xmm3,%xmm13
+       vpand   %xmm15,%xmm3,%xmm3
+       vpaddq  %xmm13,%xmm4,%xmm4
+
+       vpsrlq  $26,%xmm0,%xmm10
+       vpand   %xmm15,%xmm0,%xmm0
+       vpaddq  %xmm10,%xmm11,%xmm1
+
+       vpsrlq  $26,%xmm4,%xmm10
+       vpand   %xmm15,%xmm4,%xmm4
+
+       vpsrlq  $26,%xmm1,%xmm11
+       vpand   %xmm15,%xmm1,%xmm1
+       vpaddq  %xmm11,%xmm2,%xmm2
+
+       vpaddq  %xmm10,%xmm0,%xmm0
+       vpsllq  $2,%xmm10,%xmm10
+       vpaddq  %xmm10,%xmm0,%xmm0
+
+       vpsrlq  $26,%xmm2,%xmm12
+       vpand   %xmm15,%xmm2,%xmm2
+       vpaddq  %xmm12,%xmm3,%xmm3
+
+       vpsrlq  $26,%xmm0,%xmm10
+       vpand   %xmm15,%xmm0,%xmm0
+       vpaddq  %xmm10,%xmm1,%xmm1
+
+       vpsrlq  $26,%xmm3,%xmm13
+       vpand   %xmm15,%xmm3,%xmm3
+       vpaddq  %xmm13,%xmm4,%xmm4
+
+       ja      .Loop_avx
+
+.Lskip_loop_avx:
+       vpshufd $0x10,%xmm14,%xmm14
+       addq    $32,%rdx
+       jnz     .Long_tail_avx
+
+       vpaddq  %xmm2,%xmm7,%xmm7
+       vpaddq  %xmm0,%xmm5,%xmm5
+       vpaddq  %xmm1,%xmm6,%xmm6
+       vpaddq  %xmm3,%xmm8,%xmm8
+       vpaddq  %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+       vmovdqa %xmm2,32(%r11)
+       vmovdqa %xmm0,0(%r11)
+       vmovdqa %xmm1,16(%r11)
+       vmovdqa %xmm3,48(%r11)
+       vmovdqa %xmm4,64(%r11)
+
+       vpmuludq        %xmm7,%xmm14,%xmm12
+       vpmuludq        %xmm5,%xmm14,%xmm10
+       vpshufd $0x10,-48(%rdi),%xmm2
+       vpmuludq        %xmm6,%xmm14,%xmm11
+       vpmuludq        %xmm8,%xmm14,%xmm13
+       vpmuludq        %xmm9,%xmm14,%xmm14
+
+       vpmuludq        %xmm8,%xmm2,%xmm0
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpshufd $0x10,-32(%rdi),%xmm3
+       vpmuludq        %xmm7,%xmm2,%xmm1
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vpshufd $0x10,-16(%rdi),%xmm4
+       vpmuludq        %xmm6,%xmm2,%xmm0
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm11,%xmm11
+       vpmuludq        %xmm9,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       vpshufd $0x10,0(%rdi),%xmm2
+       vpmuludq        %xmm7,%xmm4,%xmm1
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vpmuludq        %xmm6,%xmm4,%xmm0
+       vpaddq  %xmm0,%xmm13,%xmm13
+       vpshufd $0x10,16(%rdi),%xmm3
+       vpmuludq        %xmm5,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm12,%xmm12
+       vpmuludq        %xmm9,%xmm2,%xmm1
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpshufd $0x10,32(%rdi),%xmm4
+       vpmuludq        %xmm8,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm10,%xmm10
+
+       vpmuludq        %xmm6,%xmm3,%xmm0
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpmuludq        %xmm5,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm13,%xmm13
+       vpshufd $0x10,48(%rdi),%xmm2
+       vpmuludq        %xmm9,%xmm4,%xmm1
+       vpaddq  %xmm1,%xmm12,%xmm12
+       vpshufd $0x10,64(%rdi),%xmm3
+       vpmuludq        %xmm8,%xmm4,%xmm0
+       vpaddq  %xmm0,%xmm11,%xmm11
+       vpmuludq        %xmm7,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm14,%xmm14
+       vpmuludq        %xmm9,%xmm3,%xmm1
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vpmuludq        %xmm8,%xmm3,%xmm0
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vpmuludq        %xmm7,%xmm3,%xmm1
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpmuludq        %xmm6,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       jz      .Lshort_tail_avx
+
+       vmovdqu 0(%rsi),%xmm0
+       vmovdqu 16(%rsi),%xmm1
+
+       vpsrldq $6,%xmm0,%xmm2
+       vpsrldq $6,%xmm1,%xmm3
+       vpunpckhqdq     %xmm1,%xmm0,%xmm4
+       vpunpcklqdq     %xmm1,%xmm0,%xmm0
+       vpunpcklqdq     %xmm3,%xmm2,%xmm3
+
+       vpsrlq  $40,%xmm4,%xmm4
+       vpsrlq  $26,%xmm0,%xmm1
+       vpand   %xmm15,%xmm0,%xmm0
+       vpsrlq  $4,%xmm3,%xmm2
+       vpand   %xmm15,%xmm1,%xmm1
+       vpsrlq  $30,%xmm3,%xmm3
+       vpand   %xmm15,%xmm2,%xmm2
+       vpand   %xmm15,%xmm3,%xmm3
+       vpor    32(%rcx),%xmm4,%xmm4
+
+       vpshufd $0x32,-64(%rdi),%xmm9
+       vpaddq  0(%r11),%xmm0,%xmm0
+       vpaddq  16(%r11),%xmm1,%xmm1
+       vpaddq  32(%r11),%xmm2,%xmm2
+       vpaddq  48(%r11),%xmm3,%xmm3
+       vpaddq  64(%r11),%xmm4,%xmm4
+
+       vpmuludq        %xmm0,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpmuludq        %xmm1,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpmuludq        %xmm2,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpshufd $0x32,-48(%rdi),%xmm7
+       vpmuludq        %xmm3,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm4,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm14,%xmm14
+
+       vpmuludq        %xmm3,%xmm7,%xmm5
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpshufd $0x32,-32(%rdi),%xmm8
+       vpmuludq        %xmm2,%xmm7,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpshufd $0x32,-16(%rdi),%xmm9
+       vpmuludq        %xmm1,%xmm7,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vpmuludq        %xmm4,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+       vpshufd $0x32,0(%rdi),%xmm7
+       vpmuludq        %xmm2,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm14,%xmm14
+       vpmuludq        %xmm1,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm13,%xmm13
+       vpshufd $0x32,16(%rdi),%xmm8
+       vpmuludq        %xmm0,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm12,%xmm12
+       vpmuludq        %xmm4,%xmm7,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpshufd $0x32,32(%rdi),%xmm9
+       vpmuludq        %xmm3,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm10,%xmm10
+
+       vpmuludq        %xmm1,%xmm8,%xmm5
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpmuludq        %xmm0,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm13,%xmm13
+       vpshufd $0x32,48(%rdi),%xmm7
+       vpmuludq        %xmm4,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm12,%xmm12
+       vpshufd $0x32,64(%rdi),%xmm8
+       vpmuludq        %xmm3,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm11,%xmm11
+       vpmuludq        %xmm2,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm10,%xmm10
+
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm14,%xmm14
+       vpmuludq        %xmm4,%xmm8,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm3,%xmm8,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpmuludq        %xmm2,%xmm8,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpmuludq        %xmm1,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+       vpsrldq $8,%xmm14,%xmm9
+       vpsrldq $8,%xmm13,%xmm8
+       vpsrldq $8,%xmm11,%xmm6
+       vpsrldq $8,%xmm10,%xmm5
+       vpsrldq $8,%xmm12,%xmm7
+       vpaddq  %xmm8,%xmm13,%xmm13
+       vpaddq  %xmm9,%xmm14,%xmm14
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpaddq  %xmm7,%xmm12,%xmm12
+
+       vpsrlq  $26,%xmm13,%xmm3
+       vpand   %xmm15,%xmm13,%xmm13
+       vpaddq  %xmm3,%xmm14,%xmm14
+
+       vpsrlq  $26,%xmm10,%xmm0
+       vpand   %xmm15,%xmm10,%xmm10
+       vpaddq  %xmm0,%xmm11,%xmm11
+
+       vpsrlq  $26,%xmm14,%xmm4
+       vpand   %xmm15,%xmm14,%xmm14
+
+       vpsrlq  $26,%xmm11,%xmm1
+       vpand   %xmm15,%xmm11,%xmm11
+       vpaddq  %xmm1,%xmm12,%xmm12
+
+       vpaddq  %xmm4,%xmm10,%xmm10
+       vpsllq  $2,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vpsrlq  $26,%xmm12,%xmm2
+       vpand   %xmm15,%xmm12,%xmm12
+       vpaddq  %xmm2,%xmm13,%xmm13
+
+       vpsrlq  $26,%xmm10,%xmm0
+       vpand   %xmm15,%xmm10,%xmm10
+       vpaddq  %xmm0,%xmm11,%xmm11
+
+       vpsrlq  $26,%xmm13,%xmm3
+       vpand   %xmm15,%xmm13,%xmm13
+       vpaddq  %xmm3,%xmm14,%xmm14
+
+       vmovd   %xmm10,-112(%rdi)
+       vmovd   %xmm11,-108(%rdi)
+       vmovd   %xmm12,-104(%rdi)
+       vmovd   %xmm13,-100(%rdi)
+       vmovd   %xmm14,-96(%rdi)
+       leaq    -8(%r10),%rsp
+
+       vzeroupper
+       ret
+ENDPROC(poly1305_blocks_avx)
+
+.align 32
+ENTRY(poly1305_emit_avx)
+       cmpl    $0,20(%rdi)
+       je      .Lemit
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ecx
+       movl    8(%rdi),%r8d
+       movl    12(%rdi),%r11d
+       movl    16(%rdi),%r10d
+
+       shlq    $26,%rcx
+       movq    %r8,%r9
+       shlq    $52,%r8
+       addq    %rcx,%rax
+       shrq    $12,%r9
+       addq    %rax,%r8
+       adcq    $0,%r9
+
+       shlq    $14,%r11
+       movq    %r10,%rax
+       shrq    $24,%r10
+       addq    %r11,%r9
+       shlq    $40,%rax
+       addq    %rax,%r9
+       adcq    $0,%r10
+
+       movq    %r10,%rax
+       movq    %r10,%rcx
+       andq    $3,%r10
+       shrq    $2,%rax
+       andq    $-4,%rcx
+       addq    %rcx,%rax
+       addq    %rax,%r8
+       adcq    $0,%r9
+       adcq    $0,%r10
+
+       movq    %r8,%rax
+       addq    $5,%r8
+       movq    %r9,%rcx
+       adcq    $0,%r9
+       adcq    $0,%r10
+       shrq    $2,%r10
+       cmovnzq %r8,%rax
+       cmovnzq %r9,%rcx
+
+       addq    0(%rdx),%rax
+       adcq    8(%rdx),%rcx
+       movq    %rax,0(%rsi)
+       movq    %rcx,8(%rsi)
+
+       ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(poly1305_blocks_avx2)
+
+       movl    20(%rdi),%r8d
+       cmpq    $128,%rdx
+       jae     .Lblocks_avx2
+       testl   %r8d,%r8d
+       jz      .Lblocks
+
+.Lblocks_avx2:
+       andq    $-16,%rdx
+       jz      .Lno_data_avx2
+
+       vzeroupper
+
+       testl   %r8d,%r8d
+       jz      .Lbase2_64_avx2
+
+       testq   $63,%rdx
+       jz      .Leven_avx2
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lblocks_avx2_body:
+
+       movq    %rdx,%r15
+
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movl    16(%rdi),%r10d
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+
+       movl    %r8d,%r14d
+       andq    $-2147483648,%r8
+       movq    %r9,%r12
+       movl    %r9d,%ebx
+       andq    $-2147483648,%r9
+
+       shrq    $6,%r8
+       shlq    $52,%r12
+       addq    %r8,%r14
+       shrq    $12,%rbx
+       shrq    $18,%r9
+       addq    %r12,%r14
+       adcq    %r9,%rbx
+
+       movq    %r10,%r8
+       shlq    $40,%r8
+       shrq    $24,%r10
+       addq    %r8,%rbx
+       adcq    $0,%r10
+
+       movq    $-4,%r9
+       movq    %r10,%r8
+       andq    %r10,%r9
+       shrq    $2,%r8
+       andq    $3,%r10
+       addq    %r9,%r8
+       addq    %r8,%r14
+       adcq    $0,%rbx
+       adcq    $0,%r10
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+.Lbase2_26_pre_avx2:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       subq    $16,%r15
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_26_pre_avx2
+
+       testq   %rcx,%rcx
+       jz      .Lstore_base2_64_avx2
+
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r11
+       movq    %rbx,%r12
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r11
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r11,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r12
+       andq    $0x3ffffff,%rbx
+       orq     %r12,%r10
+
+       testq   %r15,%r15
+       jz      .Lstore_base2_26_avx2
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       jmp     .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %r10,16(%rdi)
+       jmp     .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+       movl    %eax,0(%rdi)
+       movl    %edx,4(%rdi)
+       movl    %r14d,8(%rdi)
+       movl    %ebx,12(%rdi)
+       movl    %r10d,16(%rdi)
+.align 16
+.Ldone_avx2:
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rsp
+
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+       ret
+
+
+.align 32
+.Lbase2_64_avx2:
+
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lbase2_64_avx2_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movl    16(%rdi),%r10d
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       testq   $63,%rdx
+       jz      .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       subq    $16,%r15
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r8
+       movq    %rbx,%r9
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r8
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r8,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r9
+       andq    $0x3ffffff,%rbx
+       orq     %r9,%r10
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       movl    $1,20(%rdi)
+
+       __poly1305_init_avx
+
+.Lproceed_avx2:
+       movq    %r15,%rdx
+
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rax
+       leaq    48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue:
+       jmp     .Ldo_avx2
+
+
+.align 32
+.Leven_avx2:
+
+       vmovd   0(%rdi),%xmm0
+       vmovd   4(%rdi),%xmm1
+       vmovd   8(%rdi),%xmm2
+       vmovd   12(%rdi),%xmm3
+       vmovd   16(%rdi),%xmm4
+
+.Ldo_avx2:
+       leaq    8(%rsp),%r10
+       subq    $0x128,%rsp
+       leaq    .Lconst(%rip),%rcx
+       leaq    48+64(%rdi),%rdi
+       vmovdqa 96(%rcx),%ymm7
+
+
+       vmovdqu -64(%rdi),%xmm9
+       andq    $-512,%rsp
+       vmovdqu -48(%rdi),%xmm10
+       vmovdqu -32(%rdi),%xmm6
+       vmovdqu -16(%rdi),%xmm11
+       vmovdqu 0(%rdi),%xmm12
+       vmovdqu 16(%rdi),%xmm13
+       leaq    144(%rsp),%rax
+       vmovdqu 32(%rdi),%xmm14
+       vpermd  %ymm9,%ymm7,%ymm9
+       vmovdqu 48(%rdi),%xmm15
+       vpermd  %ymm10,%ymm7,%ymm10
+       vmovdqu 64(%rdi),%xmm5
+       vpermd  %ymm6,%ymm7,%ymm6
+       vmovdqa %ymm9,0(%rsp)
+       vpermd  %ymm11,%ymm7,%ymm11
+       vmovdqa %ymm10,32-144(%rax)
+       vpermd  %ymm12,%ymm7,%ymm12
+       vmovdqa %ymm6,64-144(%rax)
+       vpermd  %ymm13,%ymm7,%ymm13
+       vmovdqa %ymm11,96-144(%rax)
+       vpermd  %ymm14,%ymm7,%ymm14
+       vmovdqa %ymm12,128-144(%rax)
+       vpermd  %ymm15,%ymm7,%ymm15
+       vmovdqa %ymm13,160-144(%rax)
+       vpermd  %ymm5,%ymm7,%ymm5
+       vmovdqa %ymm14,192-144(%rax)
+       vmovdqa %ymm15,224-144(%rax)
+       vmovdqa %ymm5,256-144(%rax)
+       vmovdqa 64(%rcx),%ymm5
+
+
+
+       vmovdqu 0(%rsi),%xmm7
+       vmovdqu 16(%rsi),%xmm8
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpsrldq $6,%ymm7,%ymm9
+       vpsrldq $6,%ymm8,%ymm10
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+       vpunpcklqdq     %ymm10,%ymm9,%ymm9
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+
+       vpsrlq  $30,%ymm9,%ymm10
+       vpsrlq  $4,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+       vpsrlq  $40,%ymm6,%ymm6
+       vpand   %ymm5,%ymm9,%ymm9
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       vpaddq  %ymm2,%ymm9,%ymm2
+       subq    $64,%rdx
+       jz      .Ltail_avx2
+       jmp     .Loop_avx2
+
+.align 32
+.Loop_avx2:
+
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqa 0(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqa 32(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqa 96(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqa 48(%rax),%ymm10
+       vmovdqa 112(%rax),%ymm5
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        64(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+       vmovdqa -16(%rax),%ymm8
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vmovdqu 0(%rsi),%xmm7
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vmovdqu 16(%rsi),%xmm8
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqa 16(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpsrldq $6,%ymm7,%ymm9
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpsrldq $6,%ymm8,%ymm10
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpunpcklqdq     %ymm10,%ymm9,%ymm10
+       vpmuludq        80(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $4,%ymm10,%ymm9
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpand   %ymm5,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpsrlq  $30,%ymm10,%ymm10
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $40,%ymm6,%ymm6
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       subq    $64,%rdx
+       jnz     .Loop_avx2
+
+.byte  0x66,0x90
+.Ltail_avx2:
+
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqu 4(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqu 36(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqu 100(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqu 52(%rax),%ymm10
+       vmovdqu 116(%rax),%ymm5
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        68(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vmovdqu -12(%rax),%ymm8
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqu 20(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpmuludq        84(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+       vpsrldq $8,%ymm12,%ymm8
+       vpsrldq $8,%ymm2,%ymm9
+       vpsrldq $8,%ymm3,%ymm10
+       vpsrldq $8,%ymm4,%ymm6
+       vpsrldq $8,%ymm0,%ymm7
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+
+       vpermq  $0x2,%ymm3,%ymm10
+       vpermq  $0x2,%ymm4,%ymm6
+       vpermq  $0x2,%ymm0,%ymm7
+       vpermq  $0x2,%ymm12,%ymm8
+       vpermq  $0x2,%ymm2,%ymm9
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vmovd   %xmm0,-112(%rdi)
+       vmovd   %xmm1,-108(%rdi)
+       vmovd   %xmm2,-104(%rdi)
+       vmovd   %xmm3,-100(%rdi)
+       vmovd   %xmm4,-96(%rdi)
+       leaq    -8(%r10),%rsp
+
+       vzeroupper
+       ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(poly1305_blocks_avx512)
+
+       movl    20(%rdi),%r8d
+       cmpq    $128,%rdx
+       jae     .Lblocks_avx2_512
+       testl   %r8d,%r8d
+       jz      .Lblocks
+
+.Lblocks_avx2_512:
+       andq    $-16,%rdx
+       jz      .Lno_data_avx2_512
+
+       vzeroupper
+
+       testl   %r8d,%r8d
+       jz      .Lbase2_64_avx2_512
+
+       testq   $63,%rdx
+       jz      .Leven_avx2_512
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lblocks_avx2_body_512:
+
+       movq    %rdx,%r15
+
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movl    16(%rdi),%r10d
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+
+       movl    %r8d,%r14d
+       andq    $-2147483648,%r8
+       movq    %r9,%r12
+       movl    %r9d,%ebx
+       andq    $-2147483648,%r9
+
+       shrq    $6,%r8
+       shlq    $52,%r12
+       addq    %r8,%r14
+       shrq    $12,%rbx
+       shrq    $18,%r9
+       addq    %r12,%r14
+       adcq    %r9,%rbx
+
+       movq    %r10,%r8
+       shlq    $40,%r8
+       shrq    $24,%r10
+       addq    %r8,%rbx
+       adcq    $0,%r10
+
+       movq    $-4,%r9
+       movq    %r10,%r8
+       andq    %r10,%r9
+       shrq    $2,%r8
+       andq    $3,%r10
+       addq    %r9,%r8
+       addq    %r8,%r14
+       adcq    $0,%rbx
+       adcq    $0,%r10
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+.Lbase2_26_pre_avx2_512:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       subq    $16,%r15
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_26_pre_avx2_512
+
+       testq   %rcx,%rcx
+       jz      .Lstore_base2_64_avx2_512
+
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r11
+       movq    %rbx,%r12
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r11
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r11,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r12
+       andq    $0x3ffffff,%rbx
+       orq     %r12,%r10
+
+       testq   %r15,%r15
+       jz      .Lstore_base2_26_avx2_512
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       jmp     .Lproceed_avx2_512
+
+.align 32
+.Lstore_base2_64_avx2_512:
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %r10,16(%rdi)
+       jmp     .Ldone_avx2_512
+
+.align 16
+.Lstore_base2_26_avx2_512:
+       movl    %eax,0(%rdi)
+       movl    %edx,4(%rdi)
+       movl    %r14d,8(%rdi)
+       movl    %ebx,12(%rdi)
+       movl    %r10d,16(%rdi)
+.align 16
+.Ldone_avx2_512:
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+       ret
+
+
+.align 32
+.Lbase2_64_avx2_512:
+
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       pushq   %rdi
+
+.Lbase2_64_avx2_body_512:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movl    16(%rdi),%r10d
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       testq   $63,%rdx
+       jz      .Linit_avx2_512
+
+.Lbase2_64_pre_avx2_512:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%r10
+       subq    $16,%r15
+
+       movq    %rdi,0(%rsp)
+       __poly1305_block
+       movq    0(%rsp),%rdi
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r8
+       movq    %rbx,%r9
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r8
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r8,%r14
+       shlq    $24,%r10
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r9
+       andq    $0x3ffffff,%rbx
+       orq     %r9,%r10
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %r10d,%xmm4
+       movl    $1,20(%rdi)
+
+       __poly1305_init_avx
+
+.Lproceed_avx2_512:
+       movq    %r15,%rdx
+
+       movq    8(%rsp),%r15
+       movq    16(%rsp),%r14
+       movq    24(%rsp),%r13
+       movq    32(%rsp),%r12
+       movq    40(%rsp),%rbx
+       leaq    48(%rsp),%rax
+       leaq    48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+       jmp     .Ldo_avx2_512
+
+
+.align 32
+.Leven_avx2_512:
+
+       vmovd   0(%rdi),%xmm0
+       vmovd   4(%rdi),%xmm1
+       vmovd   8(%rdi),%xmm2
+       vmovd   12(%rdi),%xmm3
+       vmovd   16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+       cmpq    $512,%rdx
+       jae     .Lblocks_avx512
+.Lskip_avx512:
+       leaq    8(%rsp),%r10
+
+       subq    $0x128,%rsp
+       leaq    .Lconst(%rip),%rcx
+       leaq    48+64(%rdi),%rdi
+       vmovdqa 96(%rcx),%ymm7
+
+
+       vmovdqu -64(%rdi),%xmm9
+       andq    $-512,%rsp
+       vmovdqu -48(%rdi),%xmm10
+       vmovdqu -32(%rdi),%xmm6
+       vmovdqu -16(%rdi),%xmm11
+       vmovdqu 0(%rdi),%xmm12
+       vmovdqu 16(%rdi),%xmm13
+       leaq    144(%rsp),%rax
+       vmovdqu 32(%rdi),%xmm14
+       vpermd  %ymm9,%ymm7,%ymm9
+       vmovdqu 48(%rdi),%xmm15
+       vpermd  %ymm10,%ymm7,%ymm10
+       vmovdqu 64(%rdi),%xmm5
+       vpermd  %ymm6,%ymm7,%ymm6
+       vmovdqa %ymm9,0(%rsp)
+       vpermd  %ymm11,%ymm7,%ymm11
+       vmovdqa %ymm10,32-144(%rax)
+       vpermd  %ymm12,%ymm7,%ymm12
+       vmovdqa %ymm6,64-144(%rax)
+       vpermd  %ymm13,%ymm7,%ymm13
+       vmovdqa %ymm11,96-144(%rax)
+       vpermd  %ymm14,%ymm7,%ymm14
+       vmovdqa %ymm12,128-144(%rax)
+       vpermd  %ymm15,%ymm7,%ymm15
+       vmovdqa %ymm13,160-144(%rax)
+       vpermd  %ymm5,%ymm7,%ymm5
+       vmovdqa %ymm14,192-144(%rax)
+       vmovdqa %ymm15,224-144(%rax)
+       vmovdqa %ymm5,256-144(%rax)
+       vmovdqa 64(%rcx),%ymm5
+
+
+
+       vmovdqu 0(%rsi),%xmm7
+       vmovdqu 16(%rsi),%xmm8
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpsrldq $6,%ymm7,%ymm9
+       vpsrldq $6,%ymm8,%ymm10
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+       vpunpcklqdq     %ymm10,%ymm9,%ymm9
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+
+       vpsrlq  $30,%ymm9,%ymm10
+       vpsrlq  $4,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+       vpsrlq  $40,%ymm6,%ymm6
+       vpand   %ymm5,%ymm9,%ymm9
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       vpaddq  %ymm2,%ymm9,%ymm2
+       subq    $64,%rdx
+       jz      .Ltail_avx2_512
+       jmp     .Loop_avx2_512
+
+.align 32
+.Loop_avx2_512:
+
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqa 0(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqa 32(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqa 96(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqa 48(%rax),%ymm10
+       vmovdqa 112(%rax),%ymm5
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        64(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+       vmovdqa -16(%rax),%ymm8
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vmovdqu 0(%rsi),%xmm7
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vmovdqu 16(%rsi),%xmm8
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqa 16(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpsrldq $6,%ymm7,%ymm9
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpsrldq $6,%ymm8,%ymm10
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpunpcklqdq     %ymm10,%ymm9,%ymm10
+       vpmuludq        80(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $4,%ymm10,%ymm9
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpand   %ymm5,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpsrlq  $30,%ymm10,%ymm10
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $40,%ymm6,%ymm6
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       subq    $64,%rdx
+       jnz     .Loop_avx2_512
+
+.byte  0x66,0x90
+.Ltail_avx2_512:
+
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqu 4(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqu 36(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqu 100(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqu 52(%rax),%ymm10
+       vmovdqu 116(%rax),%ymm5
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        68(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vmovdqu -12(%rax),%ymm8
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqu 20(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpmuludq        84(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+       vpsrldq $8,%ymm12,%ymm8
+       vpsrldq $8,%ymm2,%ymm9
+       vpsrldq $8,%ymm3,%ymm10
+       vpsrldq $8,%ymm4,%ymm6
+       vpsrldq $8,%ymm0,%ymm7
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+
+       vpermq  $0x2,%ymm3,%ymm10
+       vpermq  $0x2,%ymm4,%ymm6
+       vpermq  $0x2,%ymm0,%ymm7
+       vpermq  $0x2,%ymm12,%ymm8
+       vpermq  $0x2,%ymm2,%ymm9
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vmovd   %xmm0,-112(%rdi)
+       vmovd   %xmm1,-108(%rdi)
+       vmovd   %xmm2,-104(%rdi)
+       vmovd   %xmm3,-100(%rdi)
+       vmovd   %xmm4,-96(%rdi)
+       leaq    -8(%r10),%rsp
+
+       vzeroupper
+       ret
+
+.Lblocks_avx512:
+
+       movl    $15,%eax
+       kmovw   %eax,%k2
+       leaq    8(%rsp),%r10
+
+       subq    $0x128,%rsp
+       leaq    .Lconst(%rip),%rcx
+       leaq    48+64(%rdi),%rdi
+       vmovdqa 96(%rcx),%ymm9
+
+       vmovdqu32       -64(%rdi),%zmm16{%k2}{z}
+       andq    $-512,%rsp
+       vmovdqu32       -48(%rdi),%zmm17{%k2}{z}
+       movq    $0x20,%rax
+       vmovdqu32       -32(%rdi),%zmm21{%k2}{z}
+       vmovdqu32       -16(%rdi),%zmm18{%k2}{z}
+       vmovdqu32       0(%rdi),%zmm22{%k2}{z}
+       vmovdqu32       16(%rdi),%zmm19{%k2}{z}
+       vmovdqu32       32(%rdi),%zmm23{%k2}{z}
+       vmovdqu32       48(%rdi),%zmm20{%k2}{z}
+       vmovdqu32       64(%rdi),%zmm24{%k2}{z}
+       vpermd  %zmm16,%zmm9,%zmm16
+       vpbroadcastq    64(%rcx),%zmm5
+       vpermd  %zmm17,%zmm9,%zmm17
+       vpermd  %zmm21,%zmm9,%zmm21
+       vpermd  %zmm18,%zmm9,%zmm18
+       vmovdqa64       %zmm16,0(%rsp){%k2}
+       vpsrlq  $32,%zmm16,%zmm7
+       vpermd  %zmm22,%zmm9,%zmm22
+       vmovdqu64       %zmm17,0(%rsp,%rax,1){%k2}
+       vpsrlq  $32,%zmm17,%zmm8
+       vpermd  %zmm19,%zmm9,%zmm19
+       vmovdqa64       %zmm21,64(%rsp){%k2}
+       vpermd  %zmm23,%zmm9,%zmm23
+       vpermd  %zmm20,%zmm9,%zmm20
+       vmovdqu64       %zmm18,64(%rsp,%rax,1){%k2}
+       vpermd  %zmm24,%zmm9,%zmm24
+       vmovdqa64       %zmm22,128(%rsp){%k2}
+       vmovdqu64       %zmm19,128(%rsp,%rax,1){%k2}
+       vmovdqa64       %zmm23,192(%rsp){%k2}
+       vmovdqu64       %zmm20,192(%rsp,%rax,1){%k2}
+       vmovdqa64       %zmm24,256(%rsp){%k2}
+
+       vpmuludq        %zmm7,%zmm16,%zmm11
+       vpmuludq        %zmm7,%zmm17,%zmm12
+       vpmuludq        %zmm7,%zmm18,%zmm13
+       vpmuludq        %zmm7,%zmm19,%zmm14
+       vpmuludq        %zmm7,%zmm20,%zmm15
+       vpsrlq  $32,%zmm18,%zmm9
+
+       vpmuludq        %zmm8,%zmm24,%zmm25
+       vpmuludq        %zmm8,%zmm16,%zmm26
+       vpmuludq        %zmm8,%zmm17,%zmm27
+       vpmuludq        %zmm8,%zmm18,%zmm28
+       vpmuludq        %zmm8,%zmm19,%zmm29
+       vpsrlq  $32,%zmm19,%zmm10
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+
+       vpmuludq        %zmm9,%zmm23,%zmm25
+       vpmuludq        %zmm9,%zmm24,%zmm26
+       vpmuludq        %zmm9,%zmm17,%zmm28
+       vpmuludq        %zmm9,%zmm18,%zmm29
+       vpmuludq        %zmm9,%zmm16,%zmm27
+       vpsrlq  $32,%zmm20,%zmm6
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm10,%zmm22,%zmm25
+       vpmuludq        %zmm10,%zmm16,%zmm28
+       vpmuludq        %zmm10,%zmm17,%zmm29
+       vpmuludq        %zmm10,%zmm23,%zmm26
+       vpmuludq        %zmm10,%zmm24,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm6,%zmm24,%zmm28
+       vpmuludq        %zmm6,%zmm16,%zmm29
+       vpmuludq        %zmm6,%zmm21,%zmm25
+       vpmuludq        %zmm6,%zmm22,%zmm26
+       vpmuludq        %zmm6,%zmm23,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vmovdqu64       0(%rsi),%zmm10
+       vmovdqu64       64(%rsi),%zmm6
+       leaq    128(%rsi),%rsi
+
+       vpsrlq  $26,%zmm14,%zmm28
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm28,%zmm15,%zmm15
+
+       vpsrlq  $26,%zmm11,%zmm25
+       vpandq  %zmm5,%zmm11,%zmm11
+       vpaddq  %zmm25,%zmm12,%zmm12
+
+       vpsrlq  $26,%zmm15,%zmm29
+       vpandq  %zmm5,%zmm15,%zmm15
+
+       vpsrlq  $26,%zmm12,%zmm26
+       vpandq  %zmm5,%zmm12,%zmm12
+       vpaddq  %zmm26,%zmm13,%zmm13
+
+       vpaddq  %zmm29,%zmm11,%zmm11
+       vpsllq  $2,%zmm29,%zmm29
+       vpaddq  %zmm29,%zmm11,%zmm11
+
+       vpsrlq  $26,%zmm13,%zmm27
+       vpandq  %zmm5,%zmm13,%zmm13
+       vpaddq  %zmm27,%zmm14,%zmm14
+
+       vpsrlq  $26,%zmm11,%zmm25
+       vpandq  %zmm5,%zmm11,%zmm11
+       vpaddq  %zmm25,%zmm12,%zmm12
+
+       vpsrlq  $26,%zmm14,%zmm28
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm28,%zmm15,%zmm15
+
+       vpunpcklqdq     %zmm6,%zmm10,%zmm7
+       vpunpckhqdq     %zmm6,%zmm10,%zmm6
+
+       vmovdqa32       128(%rcx),%zmm25
+       movl    $0x7777,%eax
+       kmovw   %eax,%k1
+
+       vpermd  %zmm16,%zmm25,%zmm16
+       vpermd  %zmm17,%zmm25,%zmm17
+       vpermd  %zmm18,%zmm25,%zmm18
+       vpermd  %zmm19,%zmm25,%zmm19
+       vpermd  %zmm20,%zmm25,%zmm20
+
+       vpermd  %zmm11,%zmm25,%zmm16{%k1}
+       vpermd  %zmm12,%zmm25,%zmm17{%k1}
+       vpermd  %zmm13,%zmm25,%zmm18{%k1}
+       vpermd  %zmm14,%zmm25,%zmm19{%k1}
+       vpermd  %zmm15,%zmm25,%zmm20{%k1}
+
+       vpslld  $2,%zmm17,%zmm21
+       vpslld  $2,%zmm18,%zmm22
+       vpslld  $2,%zmm19,%zmm23
+       vpslld  $2,%zmm20,%zmm24
+       vpaddd  %zmm17,%zmm21,%zmm21
+       vpaddd  %zmm18,%zmm22,%zmm22
+       vpaddd  %zmm19,%zmm23,%zmm23
+       vpaddd  %zmm20,%zmm24,%zmm24
+
+       vpbroadcastq    32(%rcx),%zmm30
+
+       vpsrlq  $52,%zmm7,%zmm9
+       vpsllq  $12,%zmm6,%zmm10
+       vporq   %zmm10,%zmm9,%zmm9
+       vpsrlq  $26,%zmm7,%zmm8
+       vpsrlq  $14,%zmm6,%zmm10
+       vpsrlq  $40,%zmm6,%zmm6
+       vpandq  %zmm5,%zmm9,%zmm9
+       vpandq  %zmm5,%zmm7,%zmm7
+
+       vpaddq  %zmm2,%zmm9,%zmm2
+       subq    $192,%rdx
+       jbe     .Ltail_avx512
+       jmp     .Loop_avx512
+
+.align 32
+.Loop_avx512:
+
+       vpmuludq        %zmm2,%zmm17,%zmm14
+       vpaddq  %zmm0,%zmm7,%zmm0
+       vpmuludq        %zmm2,%zmm18,%zmm15
+       vpandq  %zmm5,%zmm8,%zmm8
+       vpmuludq        %zmm2,%zmm23,%zmm11
+       vpandq  %zmm5,%zmm10,%zmm10
+       vpmuludq        %zmm2,%zmm24,%zmm12
+       vporq   %zmm30,%zmm6,%zmm6
+       vpmuludq        %zmm2,%zmm16,%zmm13
+       vpaddq  %zmm1,%zmm8,%zmm1
+       vpaddq  %zmm3,%zmm10,%zmm3
+       vpaddq  %zmm4,%zmm6,%zmm4
+
+       vmovdqu64       0(%rsi),%zmm10
+       vmovdqu64       64(%rsi),%zmm6
+       leaq    128(%rsi),%rsi
+       vpmuludq        %zmm0,%zmm19,%zmm28
+       vpmuludq        %zmm0,%zmm20,%zmm29
+       vpmuludq        %zmm0,%zmm16,%zmm25
+       vpmuludq        %zmm0,%zmm17,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+
+       vpmuludq        %zmm1,%zmm18,%zmm28
+       vpmuludq        %zmm1,%zmm19,%zmm29
+       vpmuludq        %zmm1,%zmm24,%zmm25
+       vpmuludq        %zmm0,%zmm18,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpunpcklqdq     %zmm6,%zmm10,%zmm7
+       vpunpckhqdq     %zmm6,%zmm10,%zmm6
+
+       vpmuludq        %zmm3,%zmm16,%zmm28
+       vpmuludq        %zmm3,%zmm17,%zmm29
+       vpmuludq        %zmm1,%zmm16,%zmm26
+       vpmuludq        %zmm1,%zmm17,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm24,%zmm28
+       vpmuludq        %zmm4,%zmm16,%zmm29
+       vpmuludq        %zmm3,%zmm22,%zmm25
+       vpmuludq        %zmm3,%zmm23,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpmuludq        %zmm3,%zmm24,%zmm27
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm21,%zmm25
+       vpmuludq        %zmm4,%zmm22,%zmm26
+       vpmuludq        %zmm4,%zmm23,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm0
+       vpaddq  %zmm26,%zmm12,%zmm1
+       vpaddq  %zmm27,%zmm13,%zmm2
+
+       vpsrlq  $52,%zmm7,%zmm9
+       vpsllq  $12,%zmm6,%zmm10
+
+       vpsrlq  $26,%zmm14,%zmm3
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm3,%zmm15,%zmm4
+
+       vporq   %zmm10,%zmm9,%zmm9
+
+       vpsrlq  $26,%zmm0,%zmm11
+       vpandq  %zmm5,%zmm0,%zmm0
+       vpaddq  %zmm11,%zmm1,%zmm1
+
+       vpandq  %zmm5,%zmm9,%zmm9
+
+       vpsrlq  $26,%zmm4,%zmm15
+       vpandq  %zmm5,%zmm4,%zmm4
+
+       vpsrlq  $26,%zmm1,%zmm12
+       vpandq  %zmm5,%zmm1,%zmm1
+       vpaddq  %zmm12,%zmm2,%zmm2
+
+       vpaddq  %zmm15,%zmm0,%zmm0
+       vpsllq  $2,%zmm15,%zmm15
+       vpaddq  %zmm15,%zmm0,%zmm0
+
+       vpaddq  %zmm9,%zmm2,%zmm2
+       vpsrlq  $26,%zmm7,%zmm8
+
+       vpsrlq  $26,%zmm2,%zmm13
+       vpandq  %zmm5,%zmm2,%zmm2
+       vpaddq  %zmm13,%zmm14,%zmm3
+
+       vpsrlq  $14,%zmm6,%zmm10
+
+       vpsrlq  $26,%zmm0,%zmm11
+       vpandq  %zmm5,%zmm0,%zmm0
+       vpaddq  %zmm11,%zmm1,%zmm1
+
+       vpsrlq  $40,%zmm6,%zmm6
+
+       vpsrlq  $26,%zmm3,%zmm14
+       vpandq  %zmm5,%zmm3,%zmm3
+       vpaddq  %zmm14,%zmm4,%zmm4
+
+       vpandq  %zmm5,%zmm7,%zmm7
+
+       subq    $128,%rdx
+       ja      .Loop_avx512
+
+.Ltail_avx512:
+
+       vpsrlq  $32,%zmm16,%zmm16
+       vpsrlq  $32,%zmm17,%zmm17
+       vpsrlq  $32,%zmm18,%zmm18
+       vpsrlq  $32,%zmm23,%zmm23
+       vpsrlq  $32,%zmm24,%zmm24
+       vpsrlq  $32,%zmm19,%zmm19
+       vpsrlq  $32,%zmm20,%zmm20
+       vpsrlq  $32,%zmm21,%zmm21
+       vpsrlq  $32,%zmm22,%zmm22
+
+       leaq    (%rsi,%rdx,1),%rsi
+
+       vpaddq  %zmm0,%zmm7,%zmm0
+
+       vpmuludq        %zmm2,%zmm17,%zmm14
+       vpmuludq        %zmm2,%zmm18,%zmm15
+       vpmuludq        %zmm2,%zmm23,%zmm11
+       vpandq  %zmm5,%zmm8,%zmm8
+       vpmuludq        %zmm2,%zmm24,%zmm12
+       vpandq  %zmm5,%zmm10,%zmm10
+       vpmuludq        %zmm2,%zmm16,%zmm13
+       vporq   %zmm30,%zmm6,%zmm6
+       vpaddq  %zmm1,%zmm8,%zmm1
+       vpaddq  %zmm3,%zmm10,%zmm3
+       vpaddq  %zmm4,%zmm6,%zmm4
+
+       vmovdqu 0(%rsi),%xmm7
+       vpmuludq        %zmm0,%zmm19,%zmm28
+       vpmuludq        %zmm0,%zmm20,%zmm29
+       vpmuludq        %zmm0,%zmm16,%zmm25
+       vpmuludq        %zmm0,%zmm17,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+
+       vmovdqu 16(%rsi),%xmm8
+       vpmuludq        %zmm1,%zmm18,%zmm28
+       vpmuludq        %zmm1,%zmm19,%zmm29
+       vpmuludq        %zmm1,%zmm24,%zmm25
+       vpmuludq        %zmm0,%zmm18,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+       vpmuludq        %zmm3,%zmm16,%zmm28
+       vpmuludq        %zmm3,%zmm17,%zmm29
+       vpmuludq        %zmm1,%zmm16,%zmm26
+       vpmuludq        %zmm1,%zmm17,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       vpmuludq        %zmm4,%zmm24,%zmm28
+       vpmuludq        %zmm4,%zmm16,%zmm29
+       vpmuludq        %zmm3,%zmm22,%zmm25
+       vpmuludq        %zmm3,%zmm23,%zmm26
+       vpmuludq        %zmm3,%zmm24,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm3
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm21,%zmm25
+       vpmuludq        %zmm4,%zmm22,%zmm26
+       vpmuludq        %zmm4,%zmm23,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm0
+       vpaddq  %zmm26,%zmm12,%zmm1
+       vpaddq  %zmm27,%zmm13,%zmm2
+
+       movl    $1,%eax
+       vpermq  $0xb1,%zmm3,%zmm14
+       vpermq  $0xb1,%zmm15,%zmm4
+       vpermq  $0xb1,%zmm0,%zmm11
+       vpermq  $0xb1,%zmm1,%zmm12
+       vpermq  $0xb1,%zmm2,%zmm13
+       vpaddq  %zmm14,%zmm3,%zmm3
+       vpaddq  %zmm15,%zmm4,%zmm4
+       vpaddq  %zmm11,%zmm0,%zmm0
+       vpaddq  %zmm12,%zmm1,%zmm1
+       vpaddq  %zmm13,%zmm2,%zmm2
+
+       kmovw   %eax,%k3
+       vpermq  $0x2,%zmm3,%zmm14
+       vpermq  $0x2,%zmm4,%zmm15
+       vpermq  $0x2,%zmm0,%zmm11
+       vpermq  $0x2,%zmm1,%zmm12
+       vpermq  $0x2,%zmm2,%zmm13
+       vpaddq  %zmm14,%zmm3,%zmm3
+       vpaddq  %zmm15,%zmm4,%zmm4
+       vpaddq  %zmm11,%zmm0,%zmm0
+       vpaddq  %zmm12,%zmm1,%zmm1
+       vpaddq  %zmm13,%zmm2,%zmm2
+
+       vextracti64x4   $0x1,%zmm3,%ymm14
+       vextracti64x4   $0x1,%zmm4,%ymm15
+       vextracti64x4   $0x1,%zmm0,%ymm11
+       vextracti64x4   $0x1,%zmm1,%ymm12
+       vextracti64x4   $0x1,%zmm2,%ymm13
+       vpaddq  %zmm14,%zmm3,%zmm3{%k3}{z}
+       vpaddq  %zmm15,%zmm4,%zmm4{%k3}{z}
+       vpaddq  %zmm11,%zmm0,%zmm0{%k3}{z}
+       vpaddq  %zmm12,%zmm1,%zmm1{%k3}{z}
+       vpaddq  %zmm13,%zmm2,%zmm2{%k3}{z}
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpsrldq $6,%ymm7,%ymm9
+       vpsrldq $6,%ymm8,%ymm10
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpunpcklqdq     %ymm10,%ymm9,%ymm9
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpsrlq  $30,%ymm9,%ymm10
+       vpsrlq  $4,%ymm9,%ymm9
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpsrlq  $26,%ymm7,%ymm8
+       vpsrlq  $40,%ymm6,%ymm6
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpand   %ymm5,%ymm9,%ymm9
+       vpand   %ymm5,%ymm7,%ymm7
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm2,%ymm9,%ymm2
+       vpand   %ymm5,%ymm8,%ymm8
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       leaq    144(%rsp),%rax
+       addq    $64,%rdx
+       jnz     .Ltail_avx2_512
+
+       vpsubq  %ymm9,%ymm2,%ymm2
+       vmovd   %xmm0,-112(%rdi)
+       vmovd   %xmm1,-108(%rdi)
+       vmovd   %xmm2,-104(%rdi)
+       vmovd   %xmm3,-100(%rdi)
+       vmovd   %xmm4,-96(%rdi)
+       vzeroall
+       leaq    -8(%r10),%rsp
+
+       ret
+
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
index 538abc359d1d..2ae1b3cb66cd 100644
--- a/lib/zinc/poly1305/poly1305.c
+++ b/lib/zinc/poly1305/poly1305.c
@@ -15,7 +15,9 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
-#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+#if defined(CONFIG_ZINC_ARCH_X86_64)
+#include "poly1305-x86_64-glue.h"
+#else
 static inline bool poly1305_init_arch(void *ctx,
                                      const u8 key[POLY1305_KEY_SIZE])
 {
-- 
2.19.0
