These x86_64 vectorized implementations come from Andy Polyakov's
CRYPTOGAMS work and are included here in raw form, without modification,
so that subsequent commits that fix these up for the kernel can show how
the code has changed.

While this is CRYPTOGAMS code, the originating code happens to be the
same as that of OpenSSL commit 4dfe4310c31c4483705991d9a798ce9be1ed1c68.

Signed-off-by: Jason A. Donenfeld <ja...@zx2c4.com>
Based-on-code-from: Andy Polyakov <ap...@openssl.org>
Cc: Andy Polyakov <ap...@openssl.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: x...@kernel.org
Cc: Samuel Neves <sne...@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumas...@gmail.com>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: kernel-harden...@lists.openwall.com
Cc: linux-crypto@vger.kernel.org
---
 .../poly1305/poly1305-x86_64-cryptogams.S     | 3565 +++++++++++++++++
 1 file changed, 3565 insertions(+)
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64-cryptogams.S

diff --git a/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
new file mode 100644
index 000000000000..ed634757354b
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
@@ -0,0 +1,3565 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <ap...@openssl.org>. All Rights Reserved.
+ */
+
+.text  
+
+
+
+.globl poly1305_init
+.hidden        poly1305_init
+.globl poly1305_blocks
+.hidden        poly1305_blocks
+.globl poly1305_emit
+.hidden        poly1305_emit
+
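+/*
+ * void poly1305_init(void *ctx, const u8 key[16], void *func[2]);
+ *
+ * Register roles inferred from the code: %rdi is the state to zero,
+ * %rsi the first 16 bytes of the key (r), and %rdx a two-entry table
+ * that receives pointers to the blocks and emit variants selected at
+ * runtime.
+ */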
+.type  poly1305_init,@function
+.align 32
+poly1305_init:
+       xorq    %rax,%rax
+       movq    %rax,0(%rdi)
+       movq    %rax,8(%rdi)
+       movq    %rax,16(%rdi)
+
+       cmpq    $0,%rsi
+       je      .Lno_key
+
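+       /*
+        * Runtime dispatch on OPENSSL_ia32cap_P: bit 28 of word 1 (AVX)
+        * selects the AVX variants, bit 5 of word 2 (AVX2, bit 37 of the
+        * combined 64-bit load) the AVX2 blocks routine, and the
+        * 0x80210000 mask over word 2 appears to gate a base 2^44
+        * (AVX-512 IFMA) path.
+        */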
+       leaq    poly1305_blocks(%rip),%r10
+       leaq    poly1305_emit(%rip),%r11
+       movq    OPENSSL_ia32cap_P+4(%rip),%r9
+       leaq    poly1305_blocks_avx(%rip),%rax
+       leaq    poly1305_emit_avx(%rip),%rcx
+       btq     $28,%r9
+       cmovcq  %rax,%r10
+       cmovcq  %rcx,%r11
+       leaq    poly1305_blocks_avx2(%rip),%rax
+       btq     $37,%r9
+       cmovcq  %rax,%r10
+       movq    $2149646336,%rax
+       shrq    $32,%r9
+       andq    %rax,%r9
+       cmpq    %rax,%r9
+       je      .Linit_base2_44
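+       /* Clamp r per RFC 7539: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff. */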
+       movq    $0x0ffffffc0fffffff,%rax
+       movq    $0x0ffffffc0ffffffc,%rcx
+       andq    0(%rsi),%rax
+       andq    8(%rsi),%rcx
+       movq    %rax,24(%rdi)
+       movq    %rcx,32(%rdi)
+       movq    %r10,0(%rdx)
+       movq    %r11,8(%rdx)
+       movl    $1,%eax
+.Lno_key:
+       .byte   0xf3,0xc3
+.size  poly1305_init,.-poly1305_init
+
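+/*
+ * void poly1305_blocks(void *ctx, const u8 *inp, size_t len, u32 padbit);
+ *
+ * Register roles inferred from the code: %rdi state, %rsi input, %rdx
+ * length in bytes (only whole 16-byte blocks are consumed), %rcx the
+ * padbit added at bit 128 of each block.
+ */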
+.type  poly1305_blocks,@function
+.align 32
+poly1305_blocks:
+.cfi_startproc 
+.Lblocks:
+       shrq    $4,%rdx
+       jz      .Lno_data
+
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+.Lblocks_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rbp
+
+       movq    %r13,%r12
+       shrq    $2,%r13
+       movq    %r12,%rax
+       addq    %r12,%r13
+       jmp     .Loop
+
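+/*
+ * Main scalar loop: one 16-byte block per iteration,
+ * h = (h + m + padbit*2^128) * r  (mod 2^130-5),
+ * with h held as %r14:%rbx plus a few spare bits in %rbp.
+ */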
+.align 32
+.Loop:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%rbp
+       mulq    %r14
+       movq    %rax,%r9
+       movq    %r11,%rax
+       movq    %rdx,%r10
+
+       mulq    %r14
+       movq    %rax,%r14
+       movq    %r11,%rax
+       movq    %rdx,%r8
+
+       mulq    %rbx
+       addq    %rax,%r9
+       movq    %r13,%rax
+       adcq    %rdx,%r10
+
+       mulq    %rbx
+       movq    %rbp,%rbx
+       addq    %rax,%r14
+       adcq    %rdx,%r8
+
+       imulq   %r13,%rbx
+       addq    %rbx,%r9
+       movq    %r8,%rbx
+       adcq    $0,%r10
+
+       imulq   %r11,%rbp
+       addq    %r9,%rbx
+       movq    $-4,%rax
+       adcq    %rbp,%r10
+
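+       /* Fold the bits at 2^130 and above back in: 2^130 = 5 (mod p),
+        * so the overflow c is re-added as 4*c + c. */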
+       andq    %r10,%rax
+       movq    %r10,%rbp
+       shrq    $2,%r10
+       andq    $3,%rbp
+       addq    %r10,%rax
+       addq    %rax,%r14
+       adcq    $0,%rbx
+       adcq    $0,%rbp
+       movq    %r12,%rax
+       decq    %r15
+       jnz     .Loop
+
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rbp,16(%rdi)
+
+       movq    0(%rsp),%r15
+.cfi_restore   %r15
+       movq    8(%rsp),%r14
+.cfi_restore   %r14
+       movq    16(%rsp),%r13
+.cfi_restore   %r13
+       movq    24(%rsp),%r12
+.cfi_restore   %r12
+       movq    32(%rsp),%rbp
+.cfi_restore   %rbp
+       movq    40(%rsp),%rbx
+.cfi_restore   %rbx
+       leaq    48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data:
+.Lblocks_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  poly1305_blocks,.-poly1305_blocks
+
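+/*
+ * void poly1305_emit(void *ctx, u8 mac[16], const u32 nonce[4]);
+ *
+ * Final reduction, register roles inferred from the code: adding 5 to h
+ * carries past bit 130 exactly when h >= 2^130-5, so the shifted third
+ * limb drives the cmovnz selection between h and h+5; the nonce at
+ * %rdx is then added and the low 128 bits are written to %rsi.
+ */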
+.type  poly1305_emit,@function
+.align 32
+poly1305_emit:
+.Lemit:
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movq    16(%rdi),%r10
+
+       movq    %r8,%rax
+       addq    $5,%r8
+       movq    %r9,%rcx
+       adcq    $0,%r9
+       adcq    $0,%r10
+       shrq    $2,%r10
+       cmovnzq %r8,%rax
+       cmovnzq %r9,%rcx
+
+       addq    0(%rdx),%rax
+       adcq    8(%rdx),%rcx
+       movq    %rax,0(%rsi)
+       movq    %rcx,8(%rsi)
+
+       .byte   0xf3,0xc3
+.size  poly1305_emit,.-poly1305_emit
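+/*
+ * Shared inner step: multiply the base 2^64 accumulator by r and reduce
+ * mod 2^130-5; duplicates the multiply/reduce portion of .Loop above.
+ * The caller adds the message block into h beforehand.
+ */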
+.type  __poly1305_block,@function
+.align 32
+__poly1305_block:
+       mulq    %r14
+       movq    %rax,%r9
+       movq    %r11,%rax
+       movq    %rdx,%r10
+
+       mulq    %r14
+       movq    %rax,%r14
+       movq    %r11,%rax
+       movq    %rdx,%r8
+
+       mulq    %rbx
+       addq    %rax,%r9
+       movq    %r13,%rax
+       adcq    %rdx,%r10
+
+       mulq    %rbx
+       movq    %rbp,%rbx
+       addq    %rax,%r14
+       adcq    %rdx,%r8
+
+       imulq   %r13,%rbx
+       addq    %rbx,%r9
+       movq    %r8,%rbx
+       adcq    $0,%r10
+
+       imulq   %r11,%rbp
+       addq    %r9,%rbx
+       movq    $-4,%rax
+       adcq    %rbp,%r10
+
+       andq    %r10,%rax
+       movq    %r10,%rbp
+       shrq    $2,%r10
+       andq    $3,%rbp
+       addq    %r10,%rax
+       addq    %rax,%r14
+       adcq    $0,%rbx
+       adcq    $0,%rbp
+       .byte   0xf3,0xc3
+.size  __poly1305_block,.-__poly1305_block
+
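+/*
+ * Build the power-of-r table for the vector code: repeated calls to
+ * __poly1305_block turn r into r^2, r^3 and r^4, each stored as five
+ * 26-bit limbs; for limbs 1..4 a copy premultiplied by 5 (the
+ * leal (%rax,%rax,4) forms) is also stored, since the mod 2^130-5
+ * folding multiplies high limb products by 5.
+ */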
+.type  __poly1305_init_avx,@function
+.align 32
+__poly1305_init_avx:
+       movq    %r11,%r14
+       movq    %r12,%rbx
+       xorq    %rbp,%rbp
+
+       leaq    48+64(%rdi),%rdi
+
+       movq    %r12,%rax
+       call    __poly1305_block
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       movq    %r11,%r9
+       andl    %r11d,%edx
+       movl    %eax,-64(%rdi)
+       shrq    $26,%r8
+       movl    %edx,-60(%rdi)
+       shrq    $26,%r9
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%eax
+       andl    %r9d,%edx
+       movl    %eax,-48(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,-44(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,-32(%rdi)
+       shrq    $26,%r8
+       movl    %edx,-28(%rdi)
+       shrq    $26,%r9
+
+       movq    %rbx,%rax
+       movq    %r12,%rdx
+       shlq    $12,%rax
+       shlq    $12,%rdx
+       orq     %r8,%rax
+       orq     %r9,%rdx
+       andl    $0x3ffffff,%eax
+       andl    $0x3ffffff,%edx
+       movl    %eax,-16(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,-12(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,0(%rdi)
+       movq    %rbx,%r8
+       movl    %edx,4(%rdi)
+       movq    %r12,%r9
+
+       movl    $0x3ffffff,%eax
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       shrq    $14,%r9
+       andl    %r8d,%eax
+       andl    %r9d,%edx
+       movl    %eax,16(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movl    %edx,20(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       movl    %eax,32(%rdi)
+       shrq    $26,%r8
+       movl    %edx,36(%rdi)
+       shrq    $26,%r9
+
+       movq    %rbp,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,48(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r9d,52(%rdi)
+       leaq    (%r9,%r9,4),%r9
+       movl    %r8d,64(%rdi)
+       movl    %r9d,68(%rdi)
+
+       movq    %r12,%rax
+       call    __poly1305_block
+
+       movl    $0x3ffffff,%eax
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       shrq    $26,%r8
+       movl    %eax,-52(%rdi)
+
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%edx
+       movl    %edx,-36(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,-20(%rdi)
+
+       movq    %rbx,%rax
+       shlq    $12,%rax
+       orq     %r8,%rax
+       andl    $0x3ffffff,%eax
+       movl    %eax,-4(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movq    %rbx,%r8
+       movl    %eax,12(%rdi)
+
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       andl    %r8d,%edx
+       movl    %edx,28(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,44(%rdi)
+
+       movq    %rbp,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,60(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r8d,76(%rdi)
+
+       movq    %r12,%rax
+       call    __poly1305_block
+
+       movl    $0x3ffffff,%eax
+       movq    %r14,%r8
+       andl    %r14d,%eax
+       shrq    $26,%r8
+       movl    %eax,-56(%rdi)
+
+       movl    $0x3ffffff,%edx
+       andl    %r8d,%edx
+       movl    %edx,-40(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,-24(%rdi)
+
+       movq    %rbx,%rax
+       shlq    $12,%rax
+       orq     %r8,%rax
+       andl    $0x3ffffff,%eax
+       movl    %eax,-8(%rdi)
+       leal    (%rax,%rax,4),%eax
+       movq    %rbx,%r8
+       movl    %eax,8(%rdi)
+
+       movl    $0x3ffffff,%edx
+       shrq    $14,%r8
+       andl    %r8d,%edx
+       movl    %edx,24(%rdi)
+       leal    (%rdx,%rdx,4),%edx
+       shrq    $26,%r8
+       movl    %edx,40(%rdi)
+
+       movq    %rbp,%rax
+       shlq    $24,%rax
+       orq     %rax,%r8
+       movl    %r8d,56(%rdi)
+       leaq    (%r8,%r8,4),%r8
+       movl    %r8d,72(%rdi)
+
+       leaq    -48-64(%rdi),%rdi
+       .byte   0xf3,0xc3
+.size  __poly1305_init_avx,.-__poly1305_init_avx
+
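+/*
+ * AVX path: the accumulator is carried as five 26-bit limbs and blocks
+ * are processed two per 128-bit lane pair; lengths of at least 128
+ * bytes (or a state already converted to base 2^26) take this route,
+ * an odd leading block being handled in base 2^64 first.
+ */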
+.type  poly1305_blocks_avx,@function
+.align 32
+poly1305_blocks_avx:
+.cfi_startproc 
+       movl    20(%rdi),%r8d
+       cmpq    $128,%rdx
+       jae     .Lblocks_avx
+       testl   %r8d,%r8d
+       jz      .Lblocks
+
+.Lblocks_avx:
+       andq    $-16,%rdx
+       jz      .Lno_data_avx
+
+       vzeroupper
+
+       testl   %r8d,%r8d
+       jz      .Lbase2_64_avx
+
+       testq   $31,%rdx
+       jz      .Leven_avx
+
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+.Lblocks_avx_body:
+
+       movq    %rdx,%r15
+
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movl    16(%rdi),%ebp
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+
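+       /* Convert the stored base 2^26 accumulator (five 32-bit words,
+        * possibly carrying stray upper bits) back to base 2^64 so the
+        * odd leading block can go through __poly1305_block. */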
+       movl    %r8d,%r14d
+       andq    $-2147483648,%r8
+       movq    %r9,%r12
+       movl    %r9d,%ebx
+       andq    $-2147483648,%r9
+
+       shrq    $6,%r8
+       shlq    $52,%r12
+       addq    %r8,%r14
+       shrq    $12,%rbx
+       shrq    $18,%r9
+       addq    %r12,%r14
+       adcq    %r9,%rbx
+
+       movq    %rbp,%r8
+       shlq    $40,%r8
+       shrq    $24,%rbp
+       addq    %r8,%rbx
+       adcq    $0,%rbp
+
+       movq    $-4,%r9
+       movq    %rbp,%r8
+       andq    %rbp,%r9
+       shrq    $2,%r8
+       andq    $3,%rbp
+       addq    %r9,%r8
+       addq    %r8,%r14
+       adcq    $0,%rbx
+       adcq    $0,%rbp
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%rbp
+
+       call    __poly1305_block
+
+       testq   %rcx,%rcx
+       jz      .Lstore_base2_64_avx
+
+
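+       /* Split h into five 26-bit limbs for the vector loop. */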
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r11
+       movq    %rbx,%r12
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r11
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r11,%r14
+       shlq    $24,%rbp
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r12
+       andq    $0x3ffffff,%rbx
+       orq     %r12,%rbp
+
+       subq    $16,%r15
+       jz      .Lstore_base2_26_avx
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %ebp,%xmm4
+       jmp     .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rbp,16(%rdi)
+       jmp     .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+       movl    %eax,0(%rdi)
+       movl    %edx,4(%rdi)
+       movl    %r14d,8(%rdi)
+       movl    %ebx,12(%rdi)
+       movl    %ebp,16(%rdi)
+.align 16
+.Ldone_avx:
+       movq    0(%rsp),%r15
+.cfi_restore   %r15
+       movq    8(%rsp),%r14
+.cfi_restore   %r14
+       movq    16(%rsp),%r13
+.cfi_restore   %r13
+       movq    24(%rsp),%r12
+.cfi_restore   %r12
+       movq    32(%rsp),%rbp
+.cfi_restore   %rbp
+       movq    40(%rsp),%rbx
+.cfi_restore   %rbx
+       leaq    48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc 
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+.Lbase2_64_avx_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movl    16(%rdi),%ebp
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       testq   $31,%rdx
+       jz      .Linit_avx
+
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%rbp
+       subq    $16,%r15
+
+       call    __poly1305_block
+
+.Linit_avx:
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r8
+       movq    %rbx,%r9
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r8
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r8,%r14
+       shlq    $24,%rbp
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r9
+       andq    $0x3ffffff,%rbx
+       orq     %r9,%rbp
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %ebp,%xmm4
+       movl    $1,20(%rdi)
+
+       call    __poly1305_init_avx
+
+.Lproceed_avx:
+       movq    %r15,%rdx
+
+       movq    0(%rsp),%r15
+.cfi_restore   %r15
+       movq    8(%rsp),%r14
+.cfi_restore   %r14
+       movq    16(%rsp),%r13
+.cfi_restore   %r13
+       movq    24(%rsp),%r12
+.cfi_restore   %r12
+       movq    32(%rsp),%rbp
+.cfi_restore   %rbp
+       movq    40(%rsp),%rbx
+.cfi_restore   %rbx
+       leaq    48(%rsp),%rax
+       leaq    48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx_epilogue:
+       jmp     .Ldo_avx
+.cfi_endproc   
+
+.align 32
+.Leven_avx:
+.cfi_startproc 
+       vmovd   0(%rdi),%xmm0
+       vmovd   4(%rdi),%xmm1
+       vmovd   8(%rdi),%xmm2
+       vmovd   12(%rdi),%xmm3
+       vmovd   16(%rdi),%xmm4
+
+.Ldo_avx:
+       leaq    -88(%rsp),%r11
+.cfi_def_cfa   %r11,0x60
+       subq    $0x178,%rsp
+       subq    $64,%rdx
+       leaq    -32(%rsi),%rax
+       cmovcq  %rax,%rsi
+
+       vmovdqu 48(%rdi),%xmm14
+       leaq    112(%rdi),%rdi
+       leaq    .Lconst(%rip),%rcx
+
+
+
+       vmovdqu 32(%rsi),%xmm5
+       vmovdqu 48(%rsi),%xmm6
+       vmovdqa 64(%rcx),%xmm15
+
+       vpsrldq $6,%xmm5,%xmm7
+       vpsrldq $6,%xmm6,%xmm8
+       vpunpckhqdq     %xmm6,%xmm5,%xmm9
+       vpunpcklqdq     %xmm6,%xmm5,%xmm5
+       vpunpcklqdq     %xmm8,%xmm7,%xmm8
+
+       vpsrlq  $40,%xmm9,%xmm9
+       vpsrlq  $26,%xmm5,%xmm6
+       vpand   %xmm15,%xmm5,%xmm5
+       vpsrlq  $4,%xmm8,%xmm7
+       vpand   %xmm15,%xmm6,%xmm6
+       vpsrlq  $30,%xmm8,%xmm8
+       vpand   %xmm15,%xmm7,%xmm7
+       vpand   %xmm15,%xmm8,%xmm8
+       vpor    32(%rcx),%xmm9,%xmm9
+
+       jbe     .Lskip_loop_avx
+
+
+       vmovdqu -48(%rdi),%xmm11
+       vmovdqu -32(%rdi),%xmm12
+       vpshufd $0xEE,%xmm14,%xmm13
+       vpshufd $0x44,%xmm14,%xmm10
+       vmovdqa %xmm13,-144(%r11)
+       vmovdqa %xmm10,0(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm14
+       vmovdqu -16(%rdi),%xmm10
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm14,-128(%r11)
+       vmovdqa %xmm11,16(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm13
+       vmovdqu 0(%rdi),%xmm11
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm13,-112(%r11)
+       vmovdqa %xmm12,32(%rsp)
+       vpshufd $0xEE,%xmm10,%xmm14
+       vmovdqu 16(%rdi),%xmm12
+       vpshufd $0x44,%xmm10,%xmm10
+       vmovdqa %xmm14,-96(%r11)
+       vmovdqa %xmm10,48(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm13
+       vmovdqu 32(%rdi),%xmm10
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm13,-80(%r11)
+       vmovdqa %xmm11,64(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm14
+       vmovdqu 48(%rdi),%xmm11
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm14,-64(%r11)
+       vmovdqa %xmm12,80(%rsp)
+       vpshufd $0xEE,%xmm10,%xmm13
+       vmovdqu 64(%rdi),%xmm12
+       vpshufd $0x44,%xmm10,%xmm10
+       vmovdqa %xmm13,-48(%r11)
+       vmovdqa %xmm10,96(%rsp)
+       vpshufd $0xEE,%xmm11,%xmm14
+       vpshufd $0x44,%xmm11,%xmm11
+       vmovdqa %xmm14,-32(%r11)
+       vmovdqa %xmm11,112(%rsp)
+       vpshufd $0xEE,%xmm12,%xmm13
+       vmovdqa 0(%rsp),%xmm14
+       vpshufd $0x44,%xmm12,%xmm12
+       vmovdqa %xmm13,-16(%r11)
+       vmovdqa %xmm12,128(%rsp)
+
+       jmp     .Loop_avx
+
+.align 32
+.Loop_avx:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
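+       /*
+        * Each iteration folds 64 bytes (four blocks) into the two-lane
+        * base 2^26 accumulator, multiplying by precomputed powers of r
+        * and interleaving the loads and limb-splitting of the next
+        * input with the vpmuludq accumulation.
+        */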
+       vpmuludq        %xmm5,%xmm14,%xmm10
+       vpmuludq        %xmm6,%xmm14,%xmm11
+       vmovdqa %xmm2,32(%r11)
+       vpmuludq        %xmm7,%xmm14,%xmm12
+       vmovdqa 16(%rsp),%xmm2
+       vpmuludq        %xmm8,%xmm14,%xmm13
+       vpmuludq        %xmm9,%xmm14,%xmm14
+
+       vmovdqa %xmm0,0(%r11)
+       vpmuludq        32(%rsp),%xmm9,%xmm0
+       vmovdqa %xmm1,16(%r11)
+       vpmuludq        %xmm8,%xmm2,%xmm1
+       vpaddq  %xmm0,%xmm10,%xmm10
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vmovdqa %xmm3,48(%r11)
+       vpmuludq        %xmm7,%xmm2,%xmm0
+       vpmuludq        %xmm6,%xmm2,%xmm1
+       vpaddq  %xmm0,%xmm13,%xmm13
+       vmovdqa 48(%rsp),%xmm3
+       vpaddq  %xmm1,%xmm12,%xmm12
+       vmovdqa %xmm4,64(%r11)
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpmuludq        %xmm7,%xmm3,%xmm0
+       vpaddq  %xmm2,%xmm11,%xmm11
+
+       vmovdqa 64(%rsp),%xmm4
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpmuludq        %xmm6,%xmm3,%xmm1
+       vpmuludq        %xmm5,%xmm3,%xmm3
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vmovdqa 80(%rsp),%xmm2
+       vpaddq  %xmm3,%xmm12,%xmm12
+       vpmuludq        %xmm9,%xmm4,%xmm0
+       vpmuludq        %xmm8,%xmm4,%xmm4
+       vpaddq  %xmm0,%xmm11,%xmm11
+       vmovdqa 96(%rsp),%xmm3
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vmovdqa 128(%rsp),%xmm4
+       vpmuludq        %xmm6,%xmm2,%xmm1
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vpaddq  %xmm2,%xmm13,%xmm13
+       vpmuludq        %xmm9,%xmm3,%xmm0
+       vpmuludq        %xmm8,%xmm3,%xmm1
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vmovdqu 0(%rsi),%xmm0
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpmuludq        %xmm7,%xmm3,%xmm3
+       vpmuludq        %xmm7,%xmm4,%xmm7
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       vmovdqu 16(%rsi),%xmm1
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vpmuludq        %xmm8,%xmm4,%xmm8
+       vpmuludq        %xmm9,%xmm4,%xmm9
+       vpsrldq $6,%xmm0,%xmm2
+       vpaddq  %xmm8,%xmm12,%xmm12
+       vpaddq  %xmm9,%xmm13,%xmm13
+       vpsrldq $6,%xmm1,%xmm3
+       vpmuludq        112(%rsp),%xmm5,%xmm9
+       vpmuludq        %xmm6,%xmm4,%xmm5
+       vpunpckhqdq     %xmm1,%xmm0,%xmm4
+       vpaddq  %xmm9,%xmm14,%xmm14
+       vmovdqa -144(%r11),%xmm9
+       vpaddq  %xmm5,%xmm10,%xmm10
+
+       vpunpcklqdq     %xmm1,%xmm0,%xmm0
+       vpunpcklqdq     %xmm3,%xmm2,%xmm3
+
+
+       vpsrldq $5,%xmm4,%xmm4
+       vpsrlq  $26,%xmm0,%xmm1
+       vpand   %xmm15,%xmm0,%xmm0
+       vpsrlq  $4,%xmm3,%xmm2
+       vpand   %xmm15,%xmm1,%xmm1
+       vpand   0(%rcx),%xmm4,%xmm4
+       vpsrlq  $30,%xmm3,%xmm3
+       vpand   %xmm15,%xmm2,%xmm2
+       vpand   %xmm15,%xmm3,%xmm3
+       vpor    32(%rcx),%xmm4,%xmm4
+
+       vpaddq  0(%r11),%xmm0,%xmm0
+       vpaddq  16(%r11),%xmm1,%xmm1
+       vpaddq  32(%r11),%xmm2,%xmm2
+       vpaddq  48(%r11),%xmm3,%xmm3
+       vpaddq  64(%r11),%xmm4,%xmm4
+
+       leaq    32(%rsi),%rax
+       leaq    64(%rsi),%rsi
+       subq    $64,%rdx
+       cmovcq  %rax,%rsi
+
+
+
+
+
+
+
+
+
+
+       vpmuludq        %xmm0,%xmm9,%xmm5
+       vpmuludq        %xmm1,%xmm9,%xmm6
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vmovdqa -128(%r11),%xmm7
+       vpmuludq        %xmm2,%xmm9,%xmm5
+       vpmuludq        %xmm3,%xmm9,%xmm6
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm4,%xmm9,%xmm9
+       vpmuludq        -112(%r11),%xmm4,%xmm5
+       vpaddq  %xmm9,%xmm14,%xmm14
+
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpmuludq        %xmm2,%xmm7,%xmm6
+       vpmuludq        %xmm3,%xmm7,%xmm5
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vmovdqa -96(%r11),%xmm8
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpmuludq        %xmm1,%xmm7,%xmm6
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm6,%xmm12,%xmm12
+       vpaddq  %xmm7,%xmm11,%xmm11
+
+       vmovdqa -80(%r11),%xmm9
+       vpmuludq        %xmm2,%xmm8,%xmm5
+       vpmuludq        %xmm1,%xmm8,%xmm6
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vmovdqa -64(%r11),%xmm7
+       vpmuludq        %xmm0,%xmm8,%xmm8
+       vpmuludq        %xmm4,%xmm9,%xmm5
+       vpaddq  %xmm8,%xmm12,%xmm12
+       vpaddq  %xmm5,%xmm11,%xmm11
+       vmovdqa -48(%r11),%xmm8
+       vpmuludq        %xmm3,%xmm9,%xmm9
+       vpmuludq        %xmm1,%xmm7,%xmm6
+       vpaddq  %xmm9,%xmm10,%xmm10
+
+       vmovdqa -16(%r11),%xmm9
+       vpaddq  %xmm6,%xmm14,%xmm14
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpmuludq        %xmm4,%xmm8,%xmm5
+       vpaddq  %xmm7,%xmm13,%xmm13
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vmovdqu 32(%rsi),%xmm5
+       vpmuludq        %xmm3,%xmm8,%xmm7
+       vpmuludq        %xmm2,%xmm8,%xmm8
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vmovdqu 48(%rsi),%xmm6
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+       vpmuludq        %xmm2,%xmm9,%xmm2
+       vpmuludq        %xmm3,%xmm9,%xmm3
+       vpsrldq $6,%xmm5,%xmm7
+       vpaddq  %xmm2,%xmm11,%xmm11
+       vpmuludq        %xmm4,%xmm9,%xmm4
+       vpsrldq $6,%xmm6,%xmm8
+       vpaddq  %xmm3,%xmm12,%xmm2
+       vpaddq  %xmm4,%xmm13,%xmm3
+       vpmuludq        -32(%r11),%xmm0,%xmm4
+       vpmuludq        %xmm1,%xmm9,%xmm0
+       vpunpckhqdq     %xmm6,%xmm5,%xmm9
+       vpaddq  %xmm4,%xmm14,%xmm4
+       vpaddq  %xmm0,%xmm10,%xmm0
+
+       vpunpcklqdq     %xmm6,%xmm5,%xmm5
+       vpunpcklqdq     %xmm8,%xmm7,%xmm8
+
+
+       vpsrldq $5,%xmm9,%xmm9
+       vpsrlq  $26,%xmm5,%xmm6
+       vmovdqa 0(%rsp),%xmm14
+       vpand   %xmm15,%xmm5,%xmm5
+       vpsrlq  $4,%xmm8,%xmm7
+       vpand   %xmm15,%xmm6,%xmm6
+       vpand   0(%rcx),%xmm9,%xmm9
+       vpsrlq  $30,%xmm8,%xmm8
+       vpand   %xmm15,%xmm7,%xmm7
+       vpand   %xmm15,%xmm8,%xmm8
+       vpor    32(%rcx),%xmm9,%xmm9
+
+
+
+
+
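+       /*
+        * Lazy carry propagation: bring each limb back below 2^26; the
+        * carry out of the top limb re-enters limb 0 multiplied by 5
+        * (vpaddq + vpsllq $2 + vpaddq = c + 4c).
+        */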
+       vpsrlq  $26,%xmm3,%xmm13
+       vpand   %xmm15,%xmm3,%xmm3
+       vpaddq  %xmm13,%xmm4,%xmm4
+
+       vpsrlq  $26,%xmm0,%xmm10
+       vpand   %xmm15,%xmm0,%xmm0
+       vpaddq  %xmm10,%xmm11,%xmm1
+
+       vpsrlq  $26,%xmm4,%xmm10
+       vpand   %xmm15,%xmm4,%xmm4
+
+       vpsrlq  $26,%xmm1,%xmm11
+       vpand   %xmm15,%xmm1,%xmm1
+       vpaddq  %xmm11,%xmm2,%xmm2
+
+       vpaddq  %xmm10,%xmm0,%xmm0
+       vpsllq  $2,%xmm10,%xmm10
+       vpaddq  %xmm10,%xmm0,%xmm0
+
+       vpsrlq  $26,%xmm2,%xmm12
+       vpand   %xmm15,%xmm2,%xmm2
+       vpaddq  %xmm12,%xmm3,%xmm3
+
+       vpsrlq  $26,%xmm0,%xmm10
+       vpand   %xmm15,%xmm0,%xmm0
+       vpaddq  %xmm10,%xmm1,%xmm1
+
+       vpsrlq  $26,%xmm3,%xmm13
+       vpand   %xmm15,%xmm3,%xmm3
+       vpaddq  %xmm13,%xmm4,%xmm4
+
+       ja      .Loop_avx
+
+.Lskip_loop_avx:
+
+
+
+       vpshufd $0x10,%xmm14,%xmm14
+       addq    $32,%rdx
+       jnz     .Long_tail_avx
+
+       vpaddq  %xmm2,%xmm7,%xmm7
+       vpaddq  %xmm0,%xmm5,%xmm5
+       vpaddq  %xmm1,%xmm6,%xmm6
+       vpaddq  %xmm3,%xmm8,%xmm8
+       vpaddq  %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+       vmovdqa %xmm2,32(%r11)
+       vmovdqa %xmm0,0(%r11)
+       vmovdqa %xmm1,16(%r11)
+       vmovdqa %xmm3,48(%r11)
+       vmovdqa %xmm4,64(%r11)
+
+
+
+
+
+
+
+       vpmuludq        %xmm7,%xmm14,%xmm12
+       vpmuludq        %xmm5,%xmm14,%xmm10
+       vpshufd $0x10,-48(%rdi),%xmm2
+       vpmuludq        %xmm6,%xmm14,%xmm11
+       vpmuludq        %xmm8,%xmm14,%xmm13
+       vpmuludq        %xmm9,%xmm14,%xmm14
+
+       vpmuludq        %xmm8,%xmm2,%xmm0
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpshufd $0x10,-32(%rdi),%xmm3
+       vpmuludq        %xmm7,%xmm2,%xmm1
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vpshufd $0x10,-16(%rdi),%xmm4
+       vpmuludq        %xmm6,%xmm2,%xmm0
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm11,%xmm11
+       vpmuludq        %xmm9,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       vpshufd $0x10,0(%rdi),%xmm2
+       vpmuludq        %xmm7,%xmm4,%xmm1
+       vpaddq  %xmm1,%xmm14,%xmm14
+       vpmuludq        %xmm6,%xmm4,%xmm0
+       vpaddq  %xmm0,%xmm13,%xmm13
+       vpshufd $0x10,16(%rdi),%xmm3
+       vpmuludq        %xmm5,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm12,%xmm12
+       vpmuludq        %xmm9,%xmm2,%xmm1
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpshufd $0x10,32(%rdi),%xmm4
+       vpmuludq        %xmm8,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm10,%xmm10
+
+       vpmuludq        %xmm6,%xmm3,%xmm0
+       vpaddq  %xmm0,%xmm14,%xmm14
+       vpmuludq        %xmm5,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm13,%xmm13
+       vpshufd $0x10,48(%rdi),%xmm2
+       vpmuludq        %xmm9,%xmm4,%xmm1
+       vpaddq  %xmm1,%xmm12,%xmm12
+       vpshufd $0x10,64(%rdi),%xmm3
+       vpmuludq        %xmm8,%xmm4,%xmm0
+       vpaddq  %xmm0,%xmm11,%xmm11
+       vpmuludq        %xmm7,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vpmuludq        %xmm5,%xmm2,%xmm2
+       vpaddq  %xmm2,%xmm14,%xmm14
+       vpmuludq        %xmm9,%xmm3,%xmm1
+       vpaddq  %xmm1,%xmm13,%xmm13
+       vpmuludq        %xmm8,%xmm3,%xmm0
+       vpaddq  %xmm0,%xmm12,%xmm12
+       vpmuludq        %xmm7,%xmm3,%xmm1
+       vpaddq  %xmm1,%xmm11,%xmm11
+       vpmuludq        %xmm6,%xmm3,%xmm3
+       vpaddq  %xmm3,%xmm10,%xmm10
+
+       jz      .Lshort_tail_avx
+
+       vmovdqu 0(%rsi),%xmm0
+       vmovdqu 16(%rsi),%xmm1
+
+       vpsrldq $6,%xmm0,%xmm2
+       vpsrldq $6,%xmm1,%xmm3
+       vpunpckhqdq     %xmm1,%xmm0,%xmm4
+       vpunpcklqdq     %xmm1,%xmm0,%xmm0
+       vpunpcklqdq     %xmm3,%xmm2,%xmm3
+
+       vpsrlq  $40,%xmm4,%xmm4
+       vpsrlq  $26,%xmm0,%xmm1
+       vpand   %xmm15,%xmm0,%xmm0
+       vpsrlq  $4,%xmm3,%xmm2
+       vpand   %xmm15,%xmm1,%xmm1
+       vpsrlq  $30,%xmm3,%xmm3
+       vpand   %xmm15,%xmm2,%xmm2
+       vpand   %xmm15,%xmm3,%xmm3
+       vpor    32(%rcx),%xmm4,%xmm4
+
+       vpshufd $0x32,-64(%rdi),%xmm9
+       vpaddq  0(%r11),%xmm0,%xmm0
+       vpaddq  16(%r11),%xmm1,%xmm1
+       vpaddq  32(%r11),%xmm2,%xmm2
+       vpaddq  48(%r11),%xmm3,%xmm3
+       vpaddq  64(%r11),%xmm4,%xmm4
+
+
+
+
+       vpmuludq        %xmm0,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpmuludq        %xmm1,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpmuludq        %xmm2,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpshufd $0x32,-48(%rdi),%xmm7
+       vpmuludq        %xmm3,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm4,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm14,%xmm14
+
+       vpmuludq        %xmm3,%xmm7,%xmm5
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpshufd $0x32,-32(%rdi),%xmm8
+       vpmuludq        %xmm2,%xmm7,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpshufd $0x32,-16(%rdi),%xmm9
+       vpmuludq        %xmm1,%xmm7,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm11,%xmm11
+       vpmuludq        %xmm4,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+       vpshufd $0x32,0(%rdi),%xmm7
+       vpmuludq        %xmm2,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm14,%xmm14
+       vpmuludq        %xmm1,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm13,%xmm13
+       vpshufd $0x32,16(%rdi),%xmm8
+       vpmuludq        %xmm0,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm12,%xmm12
+       vpmuludq        %xmm4,%xmm7,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpshufd $0x32,32(%rdi),%xmm9
+       vpmuludq        %xmm3,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm10,%xmm10
+
+       vpmuludq        %xmm1,%xmm8,%xmm5
+       vpaddq  %xmm5,%xmm14,%xmm14
+       vpmuludq        %xmm0,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm13,%xmm13
+       vpshufd $0x32,48(%rdi),%xmm7
+       vpmuludq        %xmm4,%xmm9,%xmm6
+       vpaddq  %xmm6,%xmm12,%xmm12
+       vpshufd $0x32,64(%rdi),%xmm8
+       vpmuludq        %xmm3,%xmm9,%xmm5
+       vpaddq  %xmm5,%xmm11,%xmm11
+       vpmuludq        %xmm2,%xmm9,%xmm9
+       vpaddq  %xmm9,%xmm10,%xmm10
+
+       vpmuludq        %xmm0,%xmm7,%xmm7
+       vpaddq  %xmm7,%xmm14,%xmm14
+       vpmuludq        %xmm4,%xmm8,%xmm6
+       vpaddq  %xmm6,%xmm13,%xmm13
+       vpmuludq        %xmm3,%xmm8,%xmm5
+       vpaddq  %xmm5,%xmm12,%xmm12
+       vpmuludq        %xmm2,%xmm8,%xmm6
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpmuludq        %xmm1,%xmm8,%xmm8
+       vpaddq  %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+
+
+       vpsrldq $8,%xmm14,%xmm9
+       vpsrldq $8,%xmm13,%xmm8
+       vpsrldq $8,%xmm11,%xmm6
+       vpsrldq $8,%xmm10,%xmm5
+       vpsrldq $8,%xmm12,%xmm7
+       vpaddq  %xmm8,%xmm13,%xmm13
+       vpaddq  %xmm9,%xmm14,%xmm14
+       vpaddq  %xmm5,%xmm10,%xmm10
+       vpaddq  %xmm6,%xmm11,%xmm11
+       vpaddq  %xmm7,%xmm12,%xmm12
+
+
+
+
+       vpsrlq  $26,%xmm13,%xmm3
+       vpand   %xmm15,%xmm13,%xmm13
+       vpaddq  %xmm3,%xmm14,%xmm14
+
+       vpsrlq  $26,%xmm10,%xmm0
+       vpand   %xmm15,%xmm10,%xmm10
+       vpaddq  %xmm0,%xmm11,%xmm11
+
+       vpsrlq  $26,%xmm14,%xmm4
+       vpand   %xmm15,%xmm14,%xmm14
+
+       vpsrlq  $26,%xmm11,%xmm1
+       vpand   %xmm15,%xmm11,%xmm11
+       vpaddq  %xmm1,%xmm12,%xmm12
+
+       vpaddq  %xmm4,%xmm10,%xmm10
+       vpsllq  $2,%xmm4,%xmm4
+       vpaddq  %xmm4,%xmm10,%xmm10
+
+       vpsrlq  $26,%xmm12,%xmm2
+       vpand   %xmm15,%xmm12,%xmm12
+       vpaddq  %xmm2,%xmm13,%xmm13
+
+       vpsrlq  $26,%xmm10,%xmm0
+       vpand   %xmm15,%xmm10,%xmm10
+       vpaddq  %xmm0,%xmm11,%xmm11
+
+       vpsrlq  $26,%xmm13,%xmm3
+       vpand   %xmm15,%xmm13,%xmm13
+       vpaddq  %xmm3,%xmm14,%xmm14
+
+       vmovd   %xmm10,-112(%rdi)
+       vmovd   %xmm11,-108(%rdi)
+       vmovd   %xmm12,-104(%rdi)
+       vmovd   %xmm13,-100(%rdi)
+       vmovd   %xmm14,-96(%rdi)
+       leaq    88(%r11),%rsp
+.cfi_def_cfa   %rsp,8
+       vzeroupper
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  poly1305_blocks_avx,.-poly1305_blocks_avx
+
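+/*
+ * Emit for the vector paths: if the state is still in base 2^26 (flag
+ * at 20(%rdi)), recombine the five limbs into base 2^64 with one final
+ * carry fold, then perform the same h vs. h+5 selection and nonce
+ * addition as the scalar .Lemit.
+ */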
+.type  poly1305_emit_avx,@function
+.align 32
+poly1305_emit_avx:
+       cmpl    $0,20(%rdi)
+       je      .Lemit
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ecx
+       movl    8(%rdi),%r8d
+       movl    12(%rdi),%r11d
+       movl    16(%rdi),%r10d
+
+       shlq    $26,%rcx
+       movq    %r8,%r9
+       shlq    $52,%r8
+       addq    %rcx,%rax
+       shrq    $12,%r9
+       addq    %rax,%r8
+       adcq    $0,%r9
+
+       shlq    $14,%r11
+       movq    %r10,%rax
+       shrq    $24,%r10
+       addq    %r11,%r9
+       shlq    $40,%rax
+       addq    %rax,%r9
+       adcq    $0,%r10
+
+       movq    %r10,%rax
+       movq    %r10,%rcx
+       andq    $3,%r10
+       shrq    $2,%rax
+       andq    $-4,%rcx
+       addq    %rcx,%rax
+       addq    %rax,%r8
+       adcq    $0,%r9
+       adcq    $0,%r10
+
+       movq    %r8,%rax
+       addq    $5,%r8
+       movq    %r9,%rcx
+       adcq    $0,%r9
+       adcq    $0,%r10
+       shrq    $2,%r10
+       cmovnzq %r8,%rax
+       cmovnzq %r9,%rcx
+
+       addq    0(%rdx),%rax
+       adcq    8(%rdx),%rcx
+       movq    %rax,0(%rsi)
+       movq    %rcx,8(%rsi)
+
+       .byte   0xf3,0xc3
+.size  poly1305_emit_avx,.-poly1305_emit_avx
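+/*
+ * AVX2 path: same base 2^26 scheme with 256-bit vectors (four lanes),
+ * consuming 64 bytes per .Loop_avx2 iteration; inputs of 512 bytes or
+ * more may branch to the AVX-512 code below when the CPU supports it.
+ */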
+.type  poly1305_blocks_avx2,@function
+.align 32
+poly1305_blocks_avx2:
+.cfi_startproc 
+       movl    20(%rdi),%r8d
+       cmpq    $128,%rdx
+       jae     .Lblocks_avx2
+       testl   %r8d,%r8d
+       jz      .Lblocks
+
+.Lblocks_avx2:
+       andq    $-16,%rdx
+       jz      .Lno_data_avx2
+
+       vzeroupper
+
+       testl   %r8d,%r8d
+       jz      .Lbase2_64_avx2
+
+       testq   $63,%rdx
+       jz      .Leven_avx2
+
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+.Lblocks_avx2_body:
+
+       movq    %rdx,%r15
+
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movl    16(%rdi),%ebp
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+
+       movl    %r8d,%r14d
+       andq    $-2147483648,%r8
+       movq    %r9,%r12
+       movl    %r9d,%ebx
+       andq    $-2147483648,%r9
+
+       shrq    $6,%r8
+       shlq    $52,%r12
+       addq    %r8,%r14
+       shrq    $12,%rbx
+       shrq    $18,%r9
+       addq    %r12,%r14
+       adcq    %r9,%rbx
+
+       movq    %rbp,%r8
+       shlq    $40,%r8
+       shrq    $24,%rbp
+       addq    %r8,%rbx
+       adcq    $0,%rbp
+
+       movq    $-4,%r9
+       movq    %rbp,%r8
+       andq    %rbp,%r9
+       shrq    $2,%r8
+       andq    $3,%rbp
+       addq    %r9,%r8
+       addq    %r8,%r14
+       adcq    $0,%rbx
+       adcq    $0,%rbp
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+.Lbase2_26_pre_avx2:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%rbp
+       subq    $16,%r15
+
+       call    __poly1305_block
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_26_pre_avx2
+
+       testq   %rcx,%rcx
+       jz      .Lstore_base2_64_avx2
+
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r11
+       movq    %rbx,%r12
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r11
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r11,%r14
+       shlq    $24,%rbp
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r12
+       andq    $0x3ffffff,%rbx
+       orq     %r12,%rbp
+
+       testq   %r15,%r15
+       jz      .Lstore_base2_26_avx2
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %ebp,%xmm4
+       jmp     .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+       movq    %r14,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rbp,16(%rdi)
+       jmp     .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+       movl    %eax,0(%rdi)
+       movl    %edx,4(%rdi)
+       movl    %r14d,8(%rdi)
+       movl    %ebx,12(%rdi)
+       movl    %ebp,16(%rdi)
+.align 16
+.Ldone_avx2:
+       movq    0(%rsp),%r15
+.cfi_restore   %r15
+       movq    8(%rsp),%r14
+.cfi_restore   %r14
+       movq    16(%rsp),%r13
+.cfi_restore   %r13
+       movq    24(%rsp),%r12
+.cfi_restore   %r12
+       movq    32(%rsp),%rbp
+.cfi_restore   %rbp
+       movq    40(%rsp),%rbx
+.cfi_restore   %rbx
+       leaq    48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+
+.align 32
+.Lbase2_64_avx2:
+.cfi_startproc 
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+.Lbase2_64_avx2_body:
+
+       movq    %rdx,%r15
+
+       movq    24(%rdi),%r11
+       movq    32(%rdi),%r13
+
+       movq    0(%rdi),%r14
+       movq    8(%rdi),%rbx
+       movl    16(%rdi),%ebp
+
+       movq    %r13,%r12
+       movq    %r13,%rax
+       shrq    $2,%r13
+       addq    %r12,%r13
+
+       testq   $63,%rdx
+       jz      .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+       addq    0(%rsi),%r14
+       adcq    8(%rsi),%rbx
+       leaq    16(%rsi),%rsi
+       adcq    %rcx,%rbp
+       subq    $16,%r15
+
+       call    __poly1305_block
+       movq    %r12,%rax
+
+       testq   $63,%r15
+       jnz     .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+       movq    %r14,%rax
+       movq    %r14,%rdx
+       shrq    $52,%r14
+       movq    %rbx,%r8
+       movq    %rbx,%r9
+       shrq    $26,%rdx
+       andq    $0x3ffffff,%rax
+       shlq    $12,%r8
+       andq    $0x3ffffff,%rdx
+       shrq    $14,%rbx
+       orq     %r8,%r14
+       shlq    $24,%rbp
+       andq    $0x3ffffff,%r14
+       shrq    $40,%r9
+       andq    $0x3ffffff,%rbx
+       orq     %r9,%rbp
+
+       vmovd   %eax,%xmm0
+       vmovd   %edx,%xmm1
+       vmovd   %r14d,%xmm2
+       vmovd   %ebx,%xmm3
+       vmovd   %ebp,%xmm4
+       movl    $1,20(%rdi)
+
+       call    __poly1305_init_avx
+
+.Lproceed_avx2:
+       movq    %r15,%rdx
+       movl    OPENSSL_ia32cap_P+8(%rip),%r10d
+       movl    $3221291008,%r11d
+
+       movq    0(%rsp),%r15
+.cfi_restore   %r15
+       movq    8(%rsp),%r14
+.cfi_restore   %r14
+       movq    16(%rsp),%r13
+.cfi_restore   %r13
+       movq    24(%rsp),%r12
+.cfi_restore   %r12
+       movq    32(%rsp),%rbp
+.cfi_restore   %rbp
+       movq    40(%rsp),%rbx
+.cfi_restore   %rbx
+       leaq    48(%rsp),%rax
+       leaq    48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx2_epilogue:
+       jmp     .Ldo_avx2
+.cfi_endproc   
+
+.align 32
+.Leven_avx2:
+.cfi_startproc 
+       movl    OPENSSL_ia32cap_P+8(%rip),%r10d
+       vmovd   0(%rdi),%xmm0
+       vmovd   4(%rdi),%xmm1
+       vmovd   8(%rdi),%xmm2
+       vmovd   12(%rdi),%xmm3
+       vmovd   16(%rdi),%xmm4
+
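+/*
+ * AVX-512 dispatch: %r10d holds OPENSSL_ia32cap_P word 2 and is masked
+ * with what appears to be AVX512F|AVX512BW|AVX512VL in %r11d; only
+ * inputs of at least 512 bytes with AVX512F surviving the mask take
+ * .Lblocks_avx512.
+ */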
+.Ldo_avx2:
+       cmpq    $512,%rdx
+       jb      .Lskip_avx512
+       andl    %r11d,%r10d
+       testl   $65536,%r10d
+       jnz     .Lblocks_avx512
+.Lskip_avx512:
+       leaq    -8(%rsp),%r11
+.cfi_def_cfa   %r11,16
+       subq    $0x128,%rsp
+       leaq    .Lconst(%rip),%rcx
+       leaq    48+64(%rdi),%rdi
+       vmovdqa 96(%rcx),%ymm7
+
+
+       vmovdqu -64(%rdi),%xmm9
+       andq    $-512,%rsp
+       vmovdqu -48(%rdi),%xmm10
+       vmovdqu -32(%rdi),%xmm6
+       vmovdqu -16(%rdi),%xmm11
+       vmovdqu 0(%rdi),%xmm12
+       vmovdqu 16(%rdi),%xmm13
+       leaq    144(%rsp),%rax
+       vmovdqu 32(%rdi),%xmm14
+       vpermd  %ymm9,%ymm7,%ymm9
+       vmovdqu 48(%rdi),%xmm15
+       vpermd  %ymm10,%ymm7,%ymm10
+       vmovdqu 64(%rdi),%xmm5
+       vpermd  %ymm6,%ymm7,%ymm6
+       vmovdqa %ymm9,0(%rsp)
+       vpermd  %ymm11,%ymm7,%ymm11
+       vmovdqa %ymm10,32-144(%rax)
+       vpermd  %ymm12,%ymm7,%ymm12
+       vmovdqa %ymm6,64-144(%rax)
+       vpermd  %ymm13,%ymm7,%ymm13
+       vmovdqa %ymm11,96-144(%rax)
+       vpermd  %ymm14,%ymm7,%ymm14
+       vmovdqa %ymm12,128-144(%rax)
+       vpermd  %ymm15,%ymm7,%ymm15
+       vmovdqa %ymm13,160-144(%rax)
+       vpermd  %ymm5,%ymm7,%ymm5
+       vmovdqa %ymm14,192-144(%rax)
+       vmovdqa %ymm15,224-144(%rax)
+       vmovdqa %ymm5,256-144(%rax)
+       vmovdqa 64(%rcx),%ymm5
+
+
+
+       vmovdqu 0(%rsi),%xmm7
+       vmovdqu 16(%rsi),%xmm8
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpsrldq $6,%ymm7,%ymm9
+       vpsrldq $6,%ymm8,%ymm10
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+       vpunpcklqdq     %ymm10,%ymm9,%ymm9
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+
+       vpsrlq  $30,%ymm9,%ymm10
+       vpsrlq  $4,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+       vpsrlq  $40,%ymm6,%ymm6
+       vpand   %ymm5,%ymm9,%ymm9
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       vpaddq  %ymm2,%ymm9,%ymm2
+       subq    $64,%rdx
+       jz      .Ltail_avx2
+       jmp     .Loop_avx2
+
+.align 32
+.Loop_avx2:
+
+
+
+
+
+
+
+
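+       /*
+        * Four-lane schoolbook multiply of the accumulator by the power
+        * table, interleaved with vinserti128 loads and limb-splitting
+        * of the next 64 bytes of input.
+        */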
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqa 0(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqa 32(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqa 96(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqa 48(%rax),%ymm10
+       vmovdqa 112(%rax),%ymm5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        64(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+       vmovdqa -16(%rax),%ymm8
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vmovdqu 0(%rsi),%xmm7
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vmovdqu 16(%rsi),%xmm8
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqa 16(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       leaq    64(%rsi),%rsi
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpsrldq $6,%ymm7,%ymm9
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpsrldq $6,%ymm8,%ymm10
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpunpcklqdq     %ymm10,%ymm9,%ymm10
+       vpmuludq        80(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+
+
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $4,%ymm10,%ymm9
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpand   %ymm5,%ymm9,%ymm9
+       vpsrlq  $26,%ymm7,%ymm8
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpsrlq  $30,%ymm10,%ymm10
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $40,%ymm6,%ymm6
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpand   %ymm5,%ymm7,%ymm7
+       vpand   %ymm5,%ymm8,%ymm8
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+
+       subq    $64,%rdx
+       jnz     .Loop_avx2
+
+.byte  0x66,0x90
+.Ltail_avx2:
+
+
+
+
+
+
+
+       vpaddq  %ymm0,%ymm7,%ymm0
+       vmovdqu 4(%rsp),%ymm7
+       vpaddq  %ymm1,%ymm8,%ymm1
+       vmovdqu 36(%rsp),%ymm8
+       vpaddq  %ymm3,%ymm10,%ymm3
+       vmovdqu 100(%rsp),%ymm9
+       vpaddq  %ymm4,%ymm6,%ymm4
+       vmovdqu 52(%rax),%ymm10
+       vmovdqu 116(%rax),%ymm5
+
+       vpmuludq        %ymm2,%ymm7,%ymm13
+       vpmuludq        %ymm2,%ymm8,%ymm14
+       vpmuludq        %ymm2,%ymm9,%ymm15
+       vpmuludq        %ymm2,%ymm10,%ymm11
+       vpmuludq        %ymm2,%ymm5,%ymm12
+
+       vpmuludq        %ymm0,%ymm8,%ymm6
+       vpmuludq        %ymm1,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        68(%rsp),%ymm4,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm11,%ymm11
+
+       vpmuludq        %ymm0,%ymm7,%ymm6
+       vpmuludq        %ymm1,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vmovdqu -12(%rax),%ymm8
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vpmuludq        %ymm3,%ymm7,%ymm6
+       vpmuludq        %ymm4,%ymm7,%ymm2
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm2,%ymm15,%ymm15
+
+       vpmuludq        %ymm3,%ymm8,%ymm6
+       vpmuludq        %ymm4,%ymm8,%ymm2
+       vpaddq  %ymm6,%ymm11,%ymm11
+       vpaddq  %ymm2,%ymm12,%ymm12
+       vmovdqu 20(%rax),%ymm2
+       vpmuludq        %ymm1,%ymm9,%ymm6
+       vpmuludq        %ymm0,%ymm9,%ymm9
+       vpaddq  %ymm6,%ymm14,%ymm14
+       vpaddq  %ymm9,%ymm13,%ymm13
+
+       vpmuludq        %ymm1,%ymm2,%ymm6
+       vpmuludq        %ymm0,%ymm2,%ymm2
+       vpaddq  %ymm6,%ymm15,%ymm15
+       vpaddq  %ymm2,%ymm14,%ymm14
+       vpmuludq        %ymm3,%ymm10,%ymm6
+       vpmuludq        %ymm4,%ymm10,%ymm2
+       vpaddq  %ymm6,%ymm12,%ymm12
+       vpaddq  %ymm2,%ymm13,%ymm13
+
+       vpmuludq        %ymm3,%ymm5,%ymm3
+       vpmuludq        %ymm4,%ymm5,%ymm4
+       vpaddq  %ymm3,%ymm13,%ymm2
+       vpaddq  %ymm4,%ymm14,%ymm3
+       vpmuludq        84(%rax),%ymm0,%ymm4
+       vpmuludq        %ymm1,%ymm5,%ymm0
+       vmovdqa 64(%rcx),%ymm5
+       vpaddq  %ymm4,%ymm15,%ymm4
+       vpaddq  %ymm0,%ymm11,%ymm0
+
+
+
+
+       vpsrldq $8,%ymm12,%ymm8
+       vpsrldq $8,%ymm2,%ymm9
+       vpsrldq $8,%ymm3,%ymm10
+       vpsrldq $8,%ymm4,%ymm6
+       vpsrldq $8,%ymm0,%ymm7
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+
+       vpermq  $0x2,%ymm3,%ymm10
+       vpermq  $0x2,%ymm4,%ymm6
+       vpermq  $0x2,%ymm0,%ymm7
+       vpermq  $0x2,%ymm12,%ymm8
+       vpermq  $0x2,%ymm2,%ymm9
+       vpaddq  %ymm10,%ymm3,%ymm3
+       vpaddq  %ymm6,%ymm4,%ymm4
+       vpaddq  %ymm7,%ymm0,%ymm0
+       vpaddq  %ymm8,%ymm12,%ymm12
+       vpaddq  %ymm9,%ymm2,%ymm2
+
+
+
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm12,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vmovd   %xmm0,-112(%rdi)
+       vmovd   %xmm1,-108(%rdi)
+       vmovd   %xmm2,-104(%rdi)
+       vmovd   %xmm3,-100(%rdi)
+       vmovd   %xmm4,-96(%rdi)
+       leaq    8(%r11),%rsp
+.cfi_def_cfa   %rsp,8
+       vzeroupper
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  poly1305_blocks_avx2,.-poly1305_blocks_avx2
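+/*
+ * AVX-512 path: eight base 2^26 lanes in zmm registers. The setup below
+ * extends the stored power table (apparently through r^8, by further
+ * multiplies of the r^1..r^4 limbs) so that .Loop_avx512 can absorb
+ * 128 bytes, i.e. eight blocks, per iteration.
+ */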
+.type  poly1305_blocks_avx512,@function
+.align 32
+poly1305_blocks_avx512:
+.cfi_startproc 
+.Lblocks_avx512:
+       movl    $15,%eax
+       kmovw   %eax,%k2
+       leaq    -8(%rsp),%r11
+.cfi_def_cfa   %r11,16
+       subq    $0x128,%rsp
+       leaq    .Lconst(%rip),%rcx
+       leaq    48+64(%rdi),%rdi
+       vmovdqa 96(%rcx),%ymm9
+
+
+       vmovdqu -64(%rdi),%xmm11
+       andq    $-512,%rsp
+       vmovdqu -48(%rdi),%xmm12
+       movq    $0x20,%rax
+       vmovdqu -32(%rdi),%xmm7
+       vmovdqu -16(%rdi),%xmm13
+       vmovdqu 0(%rdi),%xmm8
+       vmovdqu 16(%rdi),%xmm14
+       vmovdqu 32(%rdi),%xmm10
+       vmovdqu 48(%rdi),%xmm15
+       vmovdqu 64(%rdi),%xmm6
+       vpermd  %zmm11,%zmm9,%zmm16
+       vpbroadcastq    64(%rcx),%zmm5
+       vpermd  %zmm12,%zmm9,%zmm17
+       vpermd  %zmm7,%zmm9,%zmm21
+       vpermd  %zmm13,%zmm9,%zmm18
+       vmovdqa64       %zmm16,0(%rsp){%k2}
+       vpsrlq  $32,%zmm16,%zmm7
+       vpermd  %zmm8,%zmm9,%zmm22
+       vmovdqu64       %zmm17,0(%rsp,%rax,1){%k2}
+       vpsrlq  $32,%zmm17,%zmm8
+       vpermd  %zmm14,%zmm9,%zmm19
+       vmovdqa64       %zmm21,64(%rsp){%k2}
+       vpermd  %zmm10,%zmm9,%zmm23
+       vpermd  %zmm15,%zmm9,%zmm20
+       vmovdqu64       %zmm18,64(%rsp,%rax,1){%k2}
+       vpermd  %zmm6,%zmm9,%zmm24
+       vmovdqa64       %zmm22,128(%rsp){%k2}
+       vmovdqu64       %zmm19,128(%rsp,%rax,1){%k2}
+       vmovdqa64       %zmm23,192(%rsp){%k2}
+       vmovdqu64       %zmm20,192(%rsp,%rax,1){%k2}
+       vmovdqa64       %zmm24,256(%rsp){%k2}
+
+
+
+
+
+
+
+
+
+
+       vpmuludq        %zmm7,%zmm16,%zmm11
+       vpmuludq        %zmm7,%zmm17,%zmm12
+       vpmuludq        %zmm7,%zmm18,%zmm13
+       vpmuludq        %zmm7,%zmm19,%zmm14
+       vpmuludq        %zmm7,%zmm20,%zmm15
+       vpsrlq  $32,%zmm18,%zmm9
+
+       vpmuludq        %zmm8,%zmm24,%zmm25
+       vpmuludq        %zmm8,%zmm16,%zmm26
+       vpmuludq        %zmm8,%zmm17,%zmm27
+       vpmuludq        %zmm8,%zmm18,%zmm28
+       vpmuludq        %zmm8,%zmm19,%zmm29
+       vpsrlq  $32,%zmm19,%zmm10
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+
+       vpmuludq        %zmm9,%zmm23,%zmm25
+       vpmuludq        %zmm9,%zmm24,%zmm26
+       vpmuludq        %zmm9,%zmm17,%zmm28
+       vpmuludq        %zmm9,%zmm18,%zmm29
+       vpmuludq        %zmm9,%zmm16,%zmm27
+       vpsrlq  $32,%zmm20,%zmm6
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm10,%zmm22,%zmm25
+       vpmuludq        %zmm10,%zmm16,%zmm28
+       vpmuludq        %zmm10,%zmm17,%zmm29
+       vpmuludq        %zmm10,%zmm23,%zmm26
+       vpmuludq        %zmm10,%zmm24,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm6,%zmm24,%zmm28
+       vpmuludq        %zmm6,%zmm16,%zmm29
+       vpmuludq        %zmm6,%zmm21,%zmm25
+       vpmuludq        %zmm6,%zmm22,%zmm26
+       vpmuludq        %zmm6,%zmm23,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+
+
+       vmovdqu64       0(%rsi),%zmm10
+       vmovdqu64       64(%rsi),%zmm6
+       leaq    128(%rsi),%rsi
+
+
+
+
+       vpsrlq  $26,%zmm14,%zmm28
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm28,%zmm15,%zmm15
+
+       vpsrlq  $26,%zmm11,%zmm25
+       vpandq  %zmm5,%zmm11,%zmm11
+       vpaddq  %zmm25,%zmm12,%zmm12
+
+       vpsrlq  $26,%zmm15,%zmm29
+       vpandq  %zmm5,%zmm15,%zmm15
+
+       vpsrlq  $26,%zmm12,%zmm26
+       vpandq  %zmm5,%zmm12,%zmm12
+       vpaddq  %zmm26,%zmm13,%zmm13
+
+       vpaddq  %zmm29,%zmm11,%zmm11
+       vpsllq  $2,%zmm29,%zmm29
+       vpaddq  %zmm29,%zmm11,%zmm11
+
+       vpsrlq  $26,%zmm13,%zmm27
+       vpandq  %zmm5,%zmm13,%zmm13
+       vpaddq  %zmm27,%zmm14,%zmm14
+
+       vpsrlq  $26,%zmm11,%zmm25
+       vpandq  %zmm5,%zmm11,%zmm11
+       vpaddq  %zmm25,%zmm12,%zmm12
+
+       vpsrlq  $26,%zmm14,%zmm28
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm28,%zmm15,%zmm15
+
+
+
+
+
+       vpunpcklqdq     %zmm6,%zmm10,%zmm7
+       vpunpckhqdq     %zmm6,%zmm10,%zmm6
+
+
+
+
+
+
+       vmovdqa32       128(%rcx),%zmm25
+       movl    $0x7777,%eax
+       kmovw   %eax,%k1
+
+       vpermd  %zmm16,%zmm25,%zmm16
+       vpermd  %zmm17,%zmm25,%zmm17
+       vpermd  %zmm18,%zmm25,%zmm18
+       vpermd  %zmm19,%zmm25,%zmm19
+       vpermd  %zmm20,%zmm25,%zmm20
+
+       vpermd  %zmm11,%zmm25,%zmm16{%k1}
+       vpermd  %zmm12,%zmm25,%zmm17{%k1}
+       vpermd  %zmm13,%zmm25,%zmm18{%k1}
+       vpermd  %zmm14,%zmm25,%zmm19{%k1}
+       vpermd  %zmm15,%zmm25,%zmm20{%k1}
+
+       vpslld  $2,%zmm17,%zmm21
+       vpslld  $2,%zmm18,%zmm22
+       vpslld  $2,%zmm19,%zmm23
+       vpslld  $2,%zmm20,%zmm24
+       vpaddd  %zmm17,%zmm21,%zmm21
+       vpaddd  %zmm18,%zmm22,%zmm22
+       vpaddd  %zmm19,%zmm23,%zmm23
+       vpaddd  %zmm20,%zmm24,%zmm24
+
+       vpbroadcastq    32(%rcx),%zmm30
+
+       vpsrlq  $52,%zmm7,%zmm9
+       vpsllq  $12,%zmm6,%zmm10
+       vporq   %zmm10,%zmm9,%zmm9
+       vpsrlq  $26,%zmm7,%zmm8
+       vpsrlq  $14,%zmm6,%zmm10
+       vpsrlq  $40,%zmm6,%zmm6
+       vpandq  %zmm5,%zmm9,%zmm9
+       vpandq  %zmm5,%zmm7,%zmm7
+
+
+
+
+       vpaddq  %zmm2,%zmm9,%zmm2
+       subq    $192,%rdx
+       jbe     .Ltail_avx512
+       jmp     .Loop_avx512
+
+.align 32
+.Loop_avx512:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
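+       /*
+        * As in the narrower loops, the multiplies by the lane powers of
+        * r are interleaved with loading the next 128 bytes, splitting
+        * them into 26-bit limbs, and the lazy carry chain.
+        */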
+       vpmuludq        %zmm2,%zmm17,%zmm14
+       vpaddq  %zmm0,%zmm7,%zmm0
+       vpmuludq        %zmm2,%zmm18,%zmm15
+       vpandq  %zmm5,%zmm8,%zmm8
+       vpmuludq        %zmm2,%zmm23,%zmm11
+       vpandq  %zmm5,%zmm10,%zmm10
+       vpmuludq        %zmm2,%zmm24,%zmm12
+       vporq   %zmm30,%zmm6,%zmm6
+       vpmuludq        %zmm2,%zmm16,%zmm13
+       vpaddq  %zmm1,%zmm8,%zmm1
+       vpaddq  %zmm3,%zmm10,%zmm3
+       vpaddq  %zmm4,%zmm6,%zmm4
+
+       vmovdqu64       0(%rsi),%zmm10
+       vmovdqu64       64(%rsi),%zmm6
+       leaq    128(%rsi),%rsi
+       vpmuludq        %zmm0,%zmm19,%zmm28
+       vpmuludq        %zmm0,%zmm20,%zmm29
+       vpmuludq        %zmm0,%zmm16,%zmm25
+       vpmuludq        %zmm0,%zmm17,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+
+       vpmuludq        %zmm1,%zmm18,%zmm28
+       vpmuludq        %zmm1,%zmm19,%zmm29
+       vpmuludq        %zmm1,%zmm24,%zmm25
+       vpmuludq        %zmm0,%zmm18,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpunpcklqdq     %zmm6,%zmm10,%zmm7
+       vpunpckhqdq     %zmm6,%zmm10,%zmm6
+
+       vpmuludq        %zmm3,%zmm16,%zmm28
+       vpmuludq        %zmm3,%zmm17,%zmm29
+       vpmuludq        %zmm1,%zmm16,%zmm26
+       vpmuludq        %zmm1,%zmm17,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm24,%zmm28
+       vpmuludq        %zmm4,%zmm16,%zmm29
+       vpmuludq        %zmm3,%zmm22,%zmm25
+       vpmuludq        %zmm3,%zmm23,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpmuludq        %zmm3,%zmm24,%zmm27
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm21,%zmm25
+       vpmuludq        %zmm4,%zmm22,%zmm26
+       vpmuludq        %zmm4,%zmm23,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm0
+       vpaddq  %zmm26,%zmm12,%zmm1
+       vpaddq  %zmm27,%zmm13,%zmm2
+
+
+
+
+       vpsrlq  $52,%zmm7,%zmm9
+       vpsllq  $12,%zmm6,%zmm10
+
+       vpsrlq  $26,%zmm14,%zmm3
+       vpandq  %zmm5,%zmm14,%zmm14
+       vpaddq  %zmm3,%zmm15,%zmm4
+
+       vporq   %zmm10,%zmm9,%zmm9
+
+       vpsrlq  $26,%zmm0,%zmm11
+       vpandq  %zmm5,%zmm0,%zmm0
+       vpaddq  %zmm11,%zmm1,%zmm1
+
+       vpandq  %zmm5,%zmm9,%zmm9
+
+       vpsrlq  $26,%zmm4,%zmm15
+       vpandq  %zmm5,%zmm4,%zmm4
+
+       vpsrlq  $26,%zmm1,%zmm12
+       vpandq  %zmm5,%zmm1,%zmm1
+       vpaddq  %zmm12,%zmm2,%zmm2
+
+       vpaddq  %zmm15,%zmm0,%zmm0
+       vpsllq  $2,%zmm15,%zmm15
+       vpaddq  %zmm15,%zmm0,%zmm0
+
+       vpaddq  %zmm9,%zmm2,%zmm2
+       vpsrlq  $26,%zmm7,%zmm8
+
+       vpsrlq  $26,%zmm2,%zmm13
+       vpandq  %zmm5,%zmm2,%zmm2
+       vpaddq  %zmm13,%zmm14,%zmm3
+
+       vpsrlq  $14,%zmm6,%zmm10
+
+       vpsrlq  $26,%zmm0,%zmm11
+       vpandq  %zmm5,%zmm0,%zmm0
+       vpaddq  %zmm11,%zmm1,%zmm1
+
+       vpsrlq  $40,%zmm6,%zmm6
+
+       vpsrlq  $26,%zmm3,%zmm14
+       vpandq  %zmm5,%zmm3,%zmm3
+       vpaddq  %zmm14,%zmm4,%zmm4
+
+       vpandq  %zmm5,%zmm7,%zmm7
+
+
+
+
+       subq    $128,%rdx
+       ja      .Loop_avx512
+
+.Ltail_avx512:
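+/*
+ * Tail: shift each vector of powers right by 32 bits so that the last
+ * iteration weights the least significant lane by r^8 down to r for the
+ * most significant one, flushing all pending blocks in one pass.
+ */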
+
+
+
+
+
+       vpsrlq  $32,%zmm16,%zmm16
+       vpsrlq  $32,%zmm17,%zmm17
+       vpsrlq  $32,%zmm18,%zmm18
+       vpsrlq  $32,%zmm23,%zmm23
+       vpsrlq  $32,%zmm24,%zmm24
+       vpsrlq  $32,%zmm19,%zmm19
+       vpsrlq  $32,%zmm20,%zmm20
+       vpsrlq  $32,%zmm21,%zmm21
+       vpsrlq  $32,%zmm22,%zmm22
+
+
+
+       leaq    (%rsi,%rdx,1),%rsi
+
+
+       vpaddq  %zmm0,%zmm7,%zmm0
+
+       vpmuludq        %zmm2,%zmm17,%zmm14
+       vpmuludq        %zmm2,%zmm18,%zmm15
+       vpmuludq        %zmm2,%zmm23,%zmm11
+       vpandq  %zmm5,%zmm8,%zmm8
+       vpmuludq        %zmm2,%zmm24,%zmm12
+       vpandq  %zmm5,%zmm10,%zmm10
+       vpmuludq        %zmm2,%zmm16,%zmm13
+       vporq   %zmm30,%zmm6,%zmm6
+       vpaddq  %zmm1,%zmm8,%zmm1
+       vpaddq  %zmm3,%zmm10,%zmm3
+       vpaddq  %zmm4,%zmm6,%zmm4
+
+       vmovdqu 0(%rsi),%xmm7
+       vpmuludq        %zmm0,%zmm19,%zmm28
+       vpmuludq        %zmm0,%zmm20,%zmm29
+       vpmuludq        %zmm0,%zmm16,%zmm25
+       vpmuludq        %zmm0,%zmm17,%zmm26
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+
+       vmovdqu 16(%rsi),%xmm8
+       vpmuludq        %zmm1,%zmm18,%zmm28
+       vpmuludq        %zmm1,%zmm19,%zmm29
+       vpmuludq        %zmm1,%zmm24,%zmm25
+       vpmuludq        %zmm0,%zmm18,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vinserti128     $1,32(%rsi),%ymm7,%ymm7
+       vpmuludq        %zmm3,%zmm16,%zmm28
+       vpmuludq        %zmm3,%zmm17,%zmm29
+       vpmuludq        %zmm1,%zmm16,%zmm26
+       vpmuludq        %zmm1,%zmm17,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm14
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vinserti128     $1,48(%rsi),%ymm8,%ymm8
+       vpmuludq        %zmm4,%zmm24,%zmm28
+       vpmuludq        %zmm4,%zmm16,%zmm29
+       vpmuludq        %zmm3,%zmm22,%zmm25
+       vpmuludq        %zmm3,%zmm23,%zmm26
+       vpmuludq        %zmm3,%zmm24,%zmm27
+       vpaddq  %zmm28,%zmm14,%zmm3
+       vpaddq  %zmm29,%zmm15,%zmm15
+       vpaddq  %zmm25,%zmm11,%zmm11
+       vpaddq  %zmm26,%zmm12,%zmm12
+       vpaddq  %zmm27,%zmm13,%zmm13
+
+       vpmuludq        %zmm4,%zmm21,%zmm25
+       vpmuludq        %zmm4,%zmm22,%zmm26
+       vpmuludq        %zmm4,%zmm23,%zmm27
+       vpaddq  %zmm25,%zmm11,%zmm0
+       vpaddq  %zmm26,%zmm12,%zmm1
+       vpaddq  %zmm27,%zmm13,%zmm2
+
+
+
+
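+/*
+ * Horizontal reduction: fold the eight lanes of each limb together by
+ * adding adjacent qwords, then adjacent 128-bit halves, then the two
+ * 256-bit halves, keeping only lane 0 via the %k3 write mask.
+ */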
+       movl    $1,%eax
+       vpermq  $0xb1,%zmm3,%zmm14
+       vpermq  $0xb1,%zmm15,%zmm4
+       vpermq  $0xb1,%zmm0,%zmm11
+       vpermq  $0xb1,%zmm1,%zmm12
+       vpermq  $0xb1,%zmm2,%zmm13
+       vpaddq  %zmm14,%zmm3,%zmm3
+       vpaddq  %zmm15,%zmm4,%zmm4
+       vpaddq  %zmm11,%zmm0,%zmm0
+       vpaddq  %zmm12,%zmm1,%zmm1
+       vpaddq  %zmm13,%zmm2,%zmm2
+
+       kmovw   %eax,%k3
+       vpermq  $0x2,%zmm3,%zmm14
+       vpermq  $0x2,%zmm4,%zmm15
+       vpermq  $0x2,%zmm0,%zmm11
+       vpermq  $0x2,%zmm1,%zmm12
+       vpermq  $0x2,%zmm2,%zmm13
+       vpaddq  %zmm14,%zmm3,%zmm3
+       vpaddq  %zmm15,%zmm4,%zmm4
+       vpaddq  %zmm11,%zmm0,%zmm0
+       vpaddq  %zmm12,%zmm1,%zmm1
+       vpaddq  %zmm13,%zmm2,%zmm2
+
+       vextracti64x4   $0x1,%zmm3,%ymm14
+       vextracti64x4   $0x1,%zmm4,%ymm15
+       vextracti64x4   $0x1,%zmm0,%ymm11
+       vextracti64x4   $0x1,%zmm1,%ymm12
+       vextracti64x4   $0x1,%zmm2,%ymm13
+       vpaddq  %zmm14,%zmm3,%zmm3{%k3}{z}
+       vpaddq  %zmm15,%zmm4,%zmm4{%k3}{z}
+       vpaddq  %zmm11,%zmm0,%zmm0{%k3}{z}
+       vpaddq  %zmm12,%zmm1,%zmm1{%k3}{z}
+       vpaddq  %zmm13,%zmm2,%zmm2{%k3}{z}
+
+
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpsrldq $6,%ymm7,%ymm9
+       vpsrldq $6,%ymm8,%ymm10
+       vpunpckhqdq     %ymm8,%ymm7,%ymm6
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpunpcklqdq     %ymm10,%ymm9,%ymm9
+       vpunpcklqdq     %ymm8,%ymm7,%ymm7
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm4,%ymm15
+       vpand   %ymm5,%ymm4,%ymm4
+
+       vpsrlq  $26,%ymm1,%ymm12
+       vpand   %ymm5,%ymm1,%ymm1
+       vpsrlq  $30,%ymm9,%ymm10
+       vpsrlq  $4,%ymm9,%ymm9
+       vpaddq  %ymm12,%ymm2,%ymm2
+
+       vpaddq  %ymm15,%ymm0,%ymm0
+       vpsllq  $2,%ymm15,%ymm15
+       vpsrlq  $26,%ymm7,%ymm8
+       vpsrlq  $40,%ymm6,%ymm6
+       vpaddq  %ymm15,%ymm0,%ymm0
+
+       vpsrlq  $26,%ymm2,%ymm13
+       vpand   %ymm5,%ymm2,%ymm2
+       vpand   %ymm5,%ymm9,%ymm9
+       vpand   %ymm5,%ymm7,%ymm7
+       vpaddq  %ymm13,%ymm3,%ymm3
+
+       vpsrlq  $26,%ymm0,%ymm11
+       vpand   %ymm5,%ymm0,%ymm0
+       vpaddq  %ymm2,%ymm9,%ymm2
+       vpand   %ymm5,%ymm8,%ymm8
+       vpaddq  %ymm11,%ymm1,%ymm1
+
+       vpsrlq  $26,%ymm3,%ymm14
+       vpand   %ymm5,%ymm3,%ymm3
+       vpand   %ymm5,%ymm10,%ymm10
+       vpor    32(%rcx),%ymm6,%ymm6
+       vpaddq  %ymm14,%ymm4,%ymm4
+
+       leaq    144(%rsp),%rax
+       addq    $64,%rdx
+       jnz     .Ltail_avx2
+
+       vpsubq  %ymm9,%ymm2,%ymm2
+       vmovd   %xmm0,-112(%rdi)
+       vmovd   %xmm1,-108(%rdi)
+       vmovd   %xmm2,-104(%rdi)
+       vmovd   %xmm3,-100(%rdi)
+       vmovd   %xmm4,-96(%rdi)
+       vzeroall
+       leaq    8(%r11),%rsp
+.cfi_def_cfa   %rsp,8
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  poly1305_blocks_avx512,.-poly1305_blocks_avx512
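+/*
+ * Base 2^44 code path, selected when AVX512_IFMA (vpmadd52) is
+ * available.  The clamped r is split into 44/44/42-bit limbs, and the
+ * 20x multiples of the upper limbs (the reduction factor modulo
+ * 2^130-5 at these limb positions) are pre-computed.  The -1 written to
+ * 64(%rdi) flags that the table of key powers has not been calculated
+ * yet, and the function table at (%rdx) is pointed at the vpmadd52
+ * blocks and base 2^44 emit routines.
+ */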
+.type  poly1305_init_base2_44,@function
+.align 32
+poly1305_init_base2_44:
+       xorq    %rax,%rax
+       movq    %rax,0(%rdi)
+       movq    %rax,8(%rdi)
+       movq    %rax,16(%rdi)
+
+.Linit_base2_44:
+       leaq    poly1305_blocks_vpmadd52(%rip),%r10
+       leaq    poly1305_emit_base2_44(%rip),%r11
+
+       movq    $0x0ffffffc0fffffff,%rax
+       movq    $0x0ffffffc0ffffffc,%rcx
+       andq    0(%rsi),%rax
+       movq    $0x00000fffffffffff,%r8
+       andq    8(%rsi),%rcx
+       movq    $0x00000fffffffffff,%r9
+       andq    %rax,%r8
+       shrdq   $44,%rcx,%rax
+       movq    %r8,40(%rdi)
+       andq    %r9,%rax
+       shrq    $24,%rcx
+       movq    %rax,48(%rdi)
+       leaq    (%rax,%rax,4),%rax
+       movq    %rcx,56(%rdi)
+       shlq    $2,%rax
+       leaq    (%rcx,%rcx,4),%rcx
+       shlq    $2,%rcx
+       movq    %rax,24(%rdi)
+       movq    %rcx,32(%rdi)
+       movq    $-1,64(%rdi)
+       movq    %r10,0(%rdx)
+       movq    %r11,8(%rdx)
+       movl    $1,%eax
+       .byte   0xf3,0xc3
+.size  poly1305_init_base2_44,.-poly1305_init_base2_44
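+/*
+ * Single-block IFMA subroutine.  While the key powers are not computed
+ * yet, up to three blocks are handled here so that the bulk of the
+ * input reaches the 4x routine as a multiple of four blocks; afterwards
+ * just enough blocks are peeled off to make the remainder even.
+ */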
+.type  poly1305_blocks_vpmadd52,@function
+.align 32
+poly1305_blocks_vpmadd52:
+       shrq    $4,%rdx
+       jz      .Lno_data_vpmadd52
+
+       shlq    $40,%rcx
+       movq    64(%rdi),%r8
+
+
+
+
+
+
+       movq    $3,%rax
+       movq    $1,%r10
+       cmpq    $4,%rdx
+       cmovaeq %r10,%rax
+       testq   %r8,%r8
+       cmovnsq %r10,%rax
+
+       andq    %rdx,%rax
+       jz      .Lblocks_vpmadd52_4x
+
+       subq    %rax,%rdx
+       movl    $7,%r10d
+       movl    $1,%r11d
+       kmovw   %r10d,%k7
+       leaq    .L2_44_inp_permd(%rip),%r10
+       kmovw   %r11d,%k1
+
+       vmovq   %rcx,%xmm21
+       vmovdqa64       0(%r10),%ymm19
+       vmovdqa64       32(%r10),%ymm20
+       vpermq  $0xcf,%ymm21,%ymm21
+       vmovdqa64       64(%r10),%ymm22
+
+       vmovdqu64       0(%rdi),%ymm16{%k7}{z}
+       vmovdqu64       40(%rdi),%ymm3{%k7}{z}
+       vmovdqu64       32(%rdi),%ymm4{%k7}{z}
+       vmovdqu64       24(%rdi),%ymm5{%k7}{z}
+
+       vmovdqa64       96(%r10),%ymm23
+       vmovdqa64       128(%r10),%ymm24
+
+       jmp     .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+       vmovdqu32       0(%rsi),%xmm18
+       leaq    16(%rsi),%rsi
+
+       vpermd  %ymm18,%ymm19,%ymm18
+       vpsrlvq %ymm20,%ymm18,%ymm18
+       vpandq  %ymm22,%ymm18,%ymm18
+       vporq   %ymm21,%ymm18,%ymm18
+
+       vpaddq  %ymm18,%ymm16,%ymm16
+
+       vpermq  $0,%ymm16,%ymm0{%k7}{z}
+       vpermq  $85,%ymm16,%ymm1{%k7}{z}
+       vpermq  $170,%ymm16,%ymm2{%k7}{z}
+
+       vpxord  %ymm16,%ymm16,%ymm16
+       vpxord  %ymm17,%ymm17,%ymm17
+
+       vpmadd52luq     %ymm3,%ymm0,%ymm16
+       vpmadd52huq     %ymm3,%ymm0,%ymm17
+
+       vpmadd52luq     %ymm4,%ymm1,%ymm16
+       vpmadd52huq     %ymm4,%ymm1,%ymm17
+
+       vpmadd52luq     %ymm5,%ymm2,%ymm16
+       vpmadd52huq     %ymm5,%ymm2,%ymm17
+
+       vpsrlvq %ymm23,%ymm16,%ymm18
+       vpsllvq %ymm24,%ymm17,%ymm17
+       vpandq  %ymm22,%ymm16,%ymm16
+
+       vpaddq  %ymm18,%ymm17,%ymm17
+
+       vpermq  $147,%ymm17,%ymm17
+
+       vpaddq  %ymm17,%ymm16,%ymm16
+
+       vpsrlvq %ymm23,%ymm16,%ymm18
+       vpandq  %ymm22,%ymm16,%ymm16
+
+       vpermq  $147,%ymm18,%ymm18
+
+       vpaddq  %ymm18,%ymm16,%ymm16
+
+       vpermq  $147,%ymm16,%ymm18{%k1}{z}
+
+       vpaddq  %ymm18,%ymm16,%ymm16
+       vpsllq  $2,%ymm18,%ymm18
+
+       vpaddq  %ymm18,%ymm16,%ymm16
+
+       decq    %rax
+       jnz     .Loop_vpmadd52
+
+       vmovdqu64       %ymm16,0(%rdi){%k7}
+
+       testq   %rdx,%rdx
+       jnz     .Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+       .byte   0xf3,0xc3
+.size  poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
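+/*
+ * Four blocks per iteration in base 2^44 using 256-bit registers.  The
+ * pad bit arrives in %rcx, is shifted up to bit 40 of the top limb
+ * (i.e. 2^128) and broadcast to %ymm31 to be OR-ed into every block.
+ */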
+.type  poly1305_blocks_vpmadd52_4x,@function
+.align 32
+poly1305_blocks_vpmadd52_4x:
+       shrq    $4,%rdx
+       jz      .Lno_data_vpmadd52_4x
+
+       shlq    $40,%rcx
+       movq    64(%rdi),%r8
+
+.Lblocks_vpmadd52_4x:
+       vpbroadcastq    %rcx,%ymm31
+
+       vmovdqa64       .Lx_mask44(%rip),%ymm28
+       movl    $5,%eax
+       vmovdqa64       .Lx_mask42(%rip),%ymm29
+       kmovw   %eax,%k1
+
+       testq   %r8,%r8
+       js      .Linit_vpmadd52
+
+       vmovq   0(%rdi),%xmm0
+       vmovq   8(%rdi),%xmm1
+       vmovq   16(%rdi),%xmm2
+
+       testq   $3,%rdx
+       jnz     .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+       vpbroadcastq    64(%rdi),%ymm3
+       vpbroadcastq    96(%rdi),%ymm4
+       vpbroadcastq    128(%rdi),%ymm5
+       vpbroadcastq    160(%rdi),%ymm16
+
+.Lblocks_vpmadd52_4x_key_loaded:
+       vpsllq  $2,%ymm5,%ymm17
+       vpaddq  %ymm5,%ymm17,%ymm17
+       vpsllq  $2,%ymm17,%ymm17
+
+       testq   $7,%rdx
+       jz      .Lblocks_vpmadd52_8x
+
+       vmovdqu64       0(%rsi),%ymm26
+       vmovdqu64       32(%rsi),%ymm27
+       leaq    64(%rsi),%rsi
+
+       vpunpcklqdq     %ymm27,%ymm26,%ymm25
+       vpunpckhqdq     %ymm27,%ymm26,%ymm27
+
+
+
+       vpsrlq  $24,%ymm27,%ymm26
+       vporq   %ymm31,%ymm26,%ymm26
+       vpaddq  %ymm26,%ymm2,%ymm2
+       vpandq  %ymm28,%ymm25,%ymm24
+       vpsrlq  $44,%ymm25,%ymm25
+       vpsllq  $20,%ymm27,%ymm27
+       vporq   %ymm27,%ymm25,%ymm25
+       vpandq  %ymm28,%ymm25,%ymm25
+
+       subq    $4,%rdx
+       jz      .Ltail_vpmadd52_4x
+       jmp     .Loop_vpmadd52_4x
+       ud2
+
+.align 32
+.Linit_vpmadd52:
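+/*
+ * Calculate the key powers for the vectorized paths: two passes of the
+ * 52-bit multiply below take the vector from r to r^2 and then up to
+ * r^4; the resulting r^1..r^4 table is interleaved and cached in the
+ * context at 64..191(%rdi).
+ */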
+       vmovq   24(%rdi),%xmm16
+       vmovq   56(%rdi),%xmm2
+       vmovq   32(%rdi),%xmm17
+       vmovq   40(%rdi),%xmm3
+       vmovq   48(%rdi),%xmm4
+
+       vmovdqa %ymm3,%ymm0
+       vmovdqa %ymm4,%ymm1
+       vmovdqa %ymm2,%ymm5
+
+       movl    $2,%eax
+
+.Lmul_init_vpmadd52:
+       vpxorq  %ymm18,%ymm18,%ymm18
+       vpmadd52luq     %ymm2,%ymm16,%ymm18
+       vpxorq  %ymm19,%ymm19,%ymm19
+       vpmadd52huq     %ymm2,%ymm16,%ymm19
+       vpxorq  %ymm20,%ymm20,%ymm20
+       vpmadd52luq     %ymm2,%ymm17,%ymm20
+       vpxorq  %ymm21,%ymm21,%ymm21
+       vpmadd52huq     %ymm2,%ymm17,%ymm21
+       vpxorq  %ymm22,%ymm22,%ymm22
+       vpmadd52luq     %ymm2,%ymm3,%ymm22
+       vpxorq  %ymm23,%ymm23,%ymm23
+       vpmadd52huq     %ymm2,%ymm3,%ymm23
+
+       vpmadd52luq     %ymm0,%ymm3,%ymm18
+       vpmadd52huq     %ymm0,%ymm3,%ymm19
+       vpmadd52luq     %ymm0,%ymm4,%ymm20
+       vpmadd52huq     %ymm0,%ymm4,%ymm21
+       vpmadd52luq     %ymm0,%ymm5,%ymm22
+       vpmadd52huq     %ymm0,%ymm5,%ymm23
+
+       vpmadd52luq     %ymm1,%ymm17,%ymm18
+       vpmadd52huq     %ymm1,%ymm17,%ymm19
+       vpmadd52luq     %ymm1,%ymm3,%ymm20
+       vpmadd52huq     %ymm1,%ymm3,%ymm21
+       vpmadd52luq     %ymm1,%ymm4,%ymm22
+       vpmadd52huq     %ymm1,%ymm4,%ymm23
+
+
+
+       vpsrlq  $44,%ymm18,%ymm30
+       vpsllq  $8,%ymm19,%ymm19
+       vpandq  %ymm28,%ymm18,%ymm0
+       vpaddq  %ymm30,%ymm19,%ymm19
+
+       vpaddq  %ymm19,%ymm20,%ymm20
+
+       vpsrlq  $44,%ymm20,%ymm30
+       vpsllq  $8,%ymm21,%ymm21
+       vpandq  %ymm28,%ymm20,%ymm1
+       vpaddq  %ymm30,%ymm21,%ymm21
+
+       vpaddq  %ymm21,%ymm22,%ymm22
+
+       vpsrlq  $42,%ymm22,%ymm30
+       vpsllq  $10,%ymm23,%ymm23
+       vpandq  %ymm29,%ymm22,%ymm2
+       vpaddq  %ymm30,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+       vpsllq  $2,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+
+       vpsrlq  $44,%ymm0,%ymm30
+       vpandq  %ymm28,%ymm0,%ymm0
+
+       vpaddq  %ymm30,%ymm1,%ymm1
+
+       decl    %eax
+       jz      .Ldone_init_vpmadd52
+
+       vpunpcklqdq     %ymm4,%ymm1,%ymm4
+       vpbroadcastq    %xmm1,%xmm1
+       vpunpcklqdq     %ymm5,%ymm2,%ymm5
+       vpbroadcastq    %xmm2,%xmm2
+       vpunpcklqdq     %ymm3,%ymm0,%ymm3
+       vpbroadcastq    %xmm0,%xmm0
+
+       vpsllq  $2,%ymm4,%ymm16
+       vpsllq  $2,%ymm5,%ymm17
+       vpaddq  %ymm4,%ymm16,%ymm16
+       vpaddq  %ymm5,%ymm17,%ymm17
+       vpsllq  $2,%ymm16,%ymm16
+       vpsllq  $2,%ymm17,%ymm17
+
+       jmp     .Lmul_init_vpmadd52
+       ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+       vinserti128     $1,%xmm4,%ymm1,%ymm4
+       vinserti128     $1,%xmm5,%ymm2,%ymm5
+       vinserti128     $1,%xmm3,%ymm0,%ymm3
+
+       vpermq  $216,%ymm4,%ymm4
+       vpermq  $216,%ymm5,%ymm5
+       vpermq  $216,%ymm3,%ymm3
+
+       vpsllq  $2,%ymm4,%ymm16
+       vpaddq  %ymm4,%ymm16,%ymm16
+       vpsllq  $2,%ymm16,%ymm16
+
+       vmovq   0(%rdi),%xmm0
+       vmovq   8(%rdi),%xmm1
+       vmovq   16(%rdi),%xmm2
+
+       testq   $3,%rdx
+       jnz     .Ldone_init_vpmadd52_2x
+
+       vmovdqu64       %ymm3,64(%rdi)
+       vpbroadcastq    %xmm3,%ymm3
+       vmovdqu64       %ymm4,96(%rdi)
+       vpbroadcastq    %xmm4,%ymm4
+       vmovdqu64       %ymm5,128(%rdi)
+       vpbroadcastq    %xmm5,%ymm5
+       vmovdqu64       %ymm16,160(%rdi)
+       vpbroadcastq    %xmm16,%ymm16
+
+       jmp     .Lblocks_vpmadd52_4x_key_loaded
+       ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+       vmovdqu64       %ymm3,64(%rdi)
+       vpsrldq $8,%ymm3,%ymm3
+       vmovdqu64       %ymm4,96(%rdi)
+       vpsrldq $8,%ymm4,%ymm4
+       vmovdqu64       %ymm5,128(%rdi)
+       vpsrldq $8,%ymm5,%ymm5
+       vmovdqu64       %ymm16,160(%rdi)
+       vpsrldq $8,%ymm16,%ymm16
+       jmp     .Lblocks_vpmadd52_2x_key_loaded
+       ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+       vmovdqu64       128+8(%rdi),%ymm5{%k1}{z}
+       vmovdqu64       160+8(%rdi),%ymm16{%k1}{z}
+       vmovdqu64       64+8(%rdi),%ymm3{%k1}{z}
+       vmovdqu64       96+8(%rdi),%ymm4{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+       vmovdqu64       0(%rsi),%ymm26
+       vpxorq  %ymm27,%ymm27,%ymm27
+       leaq    32(%rsi),%rsi
+
+       vpunpcklqdq     %ymm27,%ymm26,%ymm25
+       vpunpckhqdq     %ymm27,%ymm26,%ymm27
+
+
+
+       vpsrlq  $24,%ymm27,%ymm26
+       vporq   %ymm31,%ymm26,%ymm26
+       vpaddq  %ymm26,%ymm2,%ymm2
+       vpandq  %ymm28,%ymm25,%ymm24
+       vpsrlq  $44,%ymm25,%ymm25
+       vpsllq  $20,%ymm27,%ymm27
+       vporq   %ymm27,%ymm25,%ymm25
+       vpandq  %ymm28,%ymm25,%ymm25
+
+       jmp     .Ltail_vpmadd52_2x
+       ud2
+
+.align 32
+.Loop_vpmadd52_4x:
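+/*
+ * 4x loop body: the low/high 52-bit multiply-accumulate of the
+ * accumulator by the key powers is interleaved with loading the next
+ * four blocks and splitting them into 44/44/(40+pad)-bit limbs, and
+ * with the carry propagation of the previous result.
+ */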
+
+       vpaddq  %ymm24,%ymm0,%ymm0
+       vpaddq  %ymm25,%ymm1,%ymm1
+
+       vpxorq  %ymm18,%ymm18,%ymm18
+       vpmadd52luq     %ymm2,%ymm16,%ymm18
+       vpxorq  %ymm19,%ymm19,%ymm19
+       vpmadd52huq     %ymm2,%ymm16,%ymm19
+       vpxorq  %ymm20,%ymm20,%ymm20
+       vpmadd52luq     %ymm2,%ymm17,%ymm20
+       vpxorq  %ymm21,%ymm21,%ymm21
+       vpmadd52huq     %ymm2,%ymm17,%ymm21
+       vpxorq  %ymm22,%ymm22,%ymm22
+       vpmadd52luq     %ymm2,%ymm3,%ymm22
+       vpxorq  %ymm23,%ymm23,%ymm23
+       vpmadd52huq     %ymm2,%ymm3,%ymm23
+
+       vmovdqu64       0(%rsi),%ymm26
+       vmovdqu64       32(%rsi),%ymm27
+       leaq    64(%rsi),%rsi
+       vpmadd52luq     %ymm0,%ymm3,%ymm18
+       vpmadd52huq     %ymm0,%ymm3,%ymm19
+       vpmadd52luq     %ymm0,%ymm4,%ymm20
+       vpmadd52huq     %ymm0,%ymm4,%ymm21
+       vpmadd52luq     %ymm0,%ymm5,%ymm22
+       vpmadd52huq     %ymm0,%ymm5,%ymm23
+
+       vpunpcklqdq     %ymm27,%ymm26,%ymm25
+       vpunpckhqdq     %ymm27,%ymm26,%ymm27
+       vpmadd52luq     %ymm1,%ymm17,%ymm18
+       vpmadd52huq     %ymm1,%ymm17,%ymm19
+       vpmadd52luq     %ymm1,%ymm3,%ymm20
+       vpmadd52huq     %ymm1,%ymm3,%ymm21
+       vpmadd52luq     %ymm1,%ymm4,%ymm22
+       vpmadd52huq     %ymm1,%ymm4,%ymm23
+
+
+
+       vpsrlq  $44,%ymm18,%ymm30
+       vpsllq  $8,%ymm19,%ymm19
+       vpandq  %ymm28,%ymm18,%ymm0
+       vpaddq  %ymm30,%ymm19,%ymm19
+
+       vpsrlq  $24,%ymm27,%ymm26
+       vporq   %ymm31,%ymm26,%ymm26
+       vpaddq  %ymm19,%ymm20,%ymm20
+
+       vpsrlq  $44,%ymm20,%ymm30
+       vpsllq  $8,%ymm21,%ymm21
+       vpandq  %ymm28,%ymm20,%ymm1
+       vpaddq  %ymm30,%ymm21,%ymm21
+
+       vpandq  %ymm28,%ymm25,%ymm24
+       vpsrlq  $44,%ymm25,%ymm25
+       vpsllq  $20,%ymm27,%ymm27
+       vpaddq  %ymm21,%ymm22,%ymm22
+
+       vpsrlq  $42,%ymm22,%ymm30
+       vpsllq  $10,%ymm23,%ymm23
+       vpandq  %ymm29,%ymm22,%ymm2
+       vpaddq  %ymm30,%ymm23,%ymm23
+
+       vpaddq  %ymm26,%ymm2,%ymm2
+       vpaddq  %ymm23,%ymm0,%ymm0
+       vpsllq  $2,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+       vporq   %ymm27,%ymm25,%ymm25
+       vpandq  %ymm28,%ymm25,%ymm25
+
+       vpsrlq  $44,%ymm0,%ymm30
+       vpandq  %ymm28,%ymm0,%ymm0
+
+       vpaddq  %ymm30,%ymm1,%ymm1
+
+       subq    $4,%rdx
+       jnz     .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+       vmovdqu64       128(%rdi),%ymm5
+       vmovdqu64       160(%rdi),%ymm16
+       vmovdqu64       64(%rdi),%ymm3
+       vmovdqu64       96(%rdi),%ymm4
+
+.Ltail_vpmadd52_2x:
+       vpsllq  $2,%ymm5,%ymm17
+       vpaddq  %ymm5,%ymm17,%ymm17
+       vpsllq  $2,%ymm17,%ymm17
+
+
+       vpaddq  %ymm24,%ymm0,%ymm0
+       vpaddq  %ymm25,%ymm1,%ymm1
+
+       vpxorq  %ymm18,%ymm18,%ymm18
+       vpmadd52luq     %ymm2,%ymm16,%ymm18
+       vpxorq  %ymm19,%ymm19,%ymm19
+       vpmadd52huq     %ymm2,%ymm16,%ymm19
+       vpxorq  %ymm20,%ymm20,%ymm20
+       vpmadd52luq     %ymm2,%ymm17,%ymm20
+       vpxorq  %ymm21,%ymm21,%ymm21
+       vpmadd52huq     %ymm2,%ymm17,%ymm21
+       vpxorq  %ymm22,%ymm22,%ymm22
+       vpmadd52luq     %ymm2,%ymm3,%ymm22
+       vpxorq  %ymm23,%ymm23,%ymm23
+       vpmadd52huq     %ymm2,%ymm3,%ymm23
+
+       vpmadd52luq     %ymm0,%ymm3,%ymm18
+       vpmadd52huq     %ymm0,%ymm3,%ymm19
+       vpmadd52luq     %ymm0,%ymm4,%ymm20
+       vpmadd52huq     %ymm0,%ymm4,%ymm21
+       vpmadd52luq     %ymm0,%ymm5,%ymm22
+       vpmadd52huq     %ymm0,%ymm5,%ymm23
+
+       vpmadd52luq     %ymm1,%ymm17,%ymm18
+       vpmadd52huq     %ymm1,%ymm17,%ymm19
+       vpmadd52luq     %ymm1,%ymm3,%ymm20
+       vpmadd52huq     %ymm1,%ymm3,%ymm21
+       vpmadd52luq     %ymm1,%ymm4,%ymm22
+       vpmadd52huq     %ymm1,%ymm4,%ymm23
+
+
+
+
+       movl    $1,%eax
+       kmovw   %eax,%k1
+       vpsrldq $8,%ymm18,%ymm24
+       vpsrldq $8,%ymm19,%ymm0
+       vpsrldq $8,%ymm20,%ymm25
+       vpsrldq $8,%ymm21,%ymm1
+       vpaddq  %ymm24,%ymm18,%ymm18
+       vpaddq  %ymm0,%ymm19,%ymm19
+       vpsrldq $8,%ymm22,%ymm26
+       vpsrldq $8,%ymm23,%ymm2
+       vpaddq  %ymm25,%ymm20,%ymm20
+       vpaddq  %ymm1,%ymm21,%ymm21
+       vpermq  $0x2,%ymm18,%ymm24
+       vpermq  $0x2,%ymm19,%ymm0
+       vpaddq  %ymm26,%ymm22,%ymm22
+       vpaddq  %ymm2,%ymm23,%ymm23
+
+       vpermq  $0x2,%ymm20,%ymm25
+       vpermq  $0x2,%ymm21,%ymm1
+       vpaddq  %ymm24,%ymm18,%ymm18{%k1}{z}
+       vpaddq  %ymm0,%ymm19,%ymm19{%k1}{z}
+       vpermq  $0x2,%ymm22,%ymm26
+       vpermq  $0x2,%ymm23,%ymm2
+       vpaddq  %ymm25,%ymm20,%ymm20{%k1}{z}
+       vpaddq  %ymm1,%ymm21,%ymm21{%k1}{z}
+       vpaddq  %ymm26,%ymm22,%ymm22{%k1}{z}
+       vpaddq  %ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+       vpsrlq  $44,%ymm18,%ymm30
+       vpsllq  $8,%ymm19,%ymm19
+       vpandq  %ymm28,%ymm18,%ymm0
+       vpaddq  %ymm30,%ymm19,%ymm19
+
+       vpaddq  %ymm19,%ymm20,%ymm20
+
+       vpsrlq  $44,%ymm20,%ymm30
+       vpsllq  $8,%ymm21,%ymm21
+       vpandq  %ymm28,%ymm20,%ymm1
+       vpaddq  %ymm30,%ymm21,%ymm21
+
+       vpaddq  %ymm21,%ymm22,%ymm22
+
+       vpsrlq  $42,%ymm22,%ymm30
+       vpsllq  $10,%ymm23,%ymm23
+       vpandq  %ymm29,%ymm22,%ymm2
+       vpaddq  %ymm30,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+       vpsllq  $2,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+
+       vpsrlq  $44,%ymm0,%ymm30
+       vpandq  %ymm28,%ymm0,%ymm0
+
+       vpaddq  %ymm30,%ymm1,%ymm1
+
+
+       subq    $2,%rdx
+       ja      .Lblocks_vpmadd52_4x_do
+
+       vmovq   %xmm0,0(%rdi)
+       vmovq   %xmm1,8(%rdi)
+       vmovq   %xmm2,16(%rdi)
+       vzeroall
+
+.Lno_data_vpmadd52_4x:
+       .byte   0xf3,0xc3
+.size  poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
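+/*
+ * Eight blocks per iteration using full 512-bit registers.  On entry
+ * the cached r^1..r^4 table is multiplied by r^4 once more to extend it
+ * to r^5..r^8; the loop then multiplies all eight lanes by r^8, and the
+ * tail applies the descending powers r^8..r^1 to drain the pipeline.
+ */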
+.type  poly1305_blocks_vpmadd52_8x,@function
+.align 32
+poly1305_blocks_vpmadd52_8x:
+       shrq    $4,%rdx
+       jz      .Lno_data_vpmadd52_8x
+
+       shlq    $40,%rcx
+       movq    64(%rdi),%r8
+
+       vmovdqa64       .Lx_mask44(%rip),%ymm28
+       vmovdqa64       .Lx_mask42(%rip),%ymm29
+
+       testq   %r8,%r8
+       js      .Linit_vpmadd52
+
+       vmovq   0(%rdi),%xmm0
+       vmovq   8(%rdi),%xmm1
+       vmovq   16(%rdi),%xmm2
+
+.Lblocks_vpmadd52_8x:
+
+
+
+       vmovdqu64       128(%rdi),%ymm5
+       vmovdqu64       160(%rdi),%ymm16
+       vmovdqu64       64(%rdi),%ymm3
+       vmovdqu64       96(%rdi),%ymm4
+
+       vpsllq  $2,%ymm5,%ymm17
+       vpaddq  %ymm5,%ymm17,%ymm17
+       vpsllq  $2,%ymm17,%ymm17
+
+       vpbroadcastq    %xmm5,%ymm8
+       vpbroadcastq    %xmm3,%ymm6
+       vpbroadcastq    %xmm4,%ymm7
+
+       vpxorq  %ymm18,%ymm18,%ymm18
+       vpmadd52luq     %ymm8,%ymm16,%ymm18
+       vpxorq  %ymm19,%ymm19,%ymm19
+       vpmadd52huq     %ymm8,%ymm16,%ymm19
+       vpxorq  %ymm20,%ymm20,%ymm20
+       vpmadd52luq     %ymm8,%ymm17,%ymm20
+       vpxorq  %ymm21,%ymm21,%ymm21
+       vpmadd52huq     %ymm8,%ymm17,%ymm21
+       vpxorq  %ymm22,%ymm22,%ymm22
+       vpmadd52luq     %ymm8,%ymm3,%ymm22
+       vpxorq  %ymm23,%ymm23,%ymm23
+       vpmadd52huq     %ymm8,%ymm3,%ymm23
+
+       vpmadd52luq     %ymm6,%ymm3,%ymm18
+       vpmadd52huq     %ymm6,%ymm3,%ymm19
+       vpmadd52luq     %ymm6,%ymm4,%ymm20
+       vpmadd52huq     %ymm6,%ymm4,%ymm21
+       vpmadd52luq     %ymm6,%ymm5,%ymm22
+       vpmadd52huq     %ymm6,%ymm5,%ymm23
+
+       vpmadd52luq     %ymm7,%ymm17,%ymm18
+       vpmadd52huq     %ymm7,%ymm17,%ymm19
+       vpmadd52luq     %ymm7,%ymm3,%ymm20
+       vpmadd52huq     %ymm7,%ymm3,%ymm21
+       vpmadd52luq     %ymm7,%ymm4,%ymm22
+       vpmadd52huq     %ymm7,%ymm4,%ymm23
+
+
+
+       vpsrlq  $44,%ymm18,%ymm30
+       vpsllq  $8,%ymm19,%ymm19
+       vpandq  %ymm28,%ymm18,%ymm6
+       vpaddq  %ymm30,%ymm19,%ymm19
+
+       vpaddq  %ymm19,%ymm20,%ymm20
+
+       vpsrlq  $44,%ymm20,%ymm30
+       vpsllq  $8,%ymm21,%ymm21
+       vpandq  %ymm28,%ymm20,%ymm7
+       vpaddq  %ymm30,%ymm21,%ymm21
+
+       vpaddq  %ymm21,%ymm22,%ymm22
+
+       vpsrlq  $42,%ymm22,%ymm30
+       vpsllq  $10,%ymm23,%ymm23
+       vpandq  %ymm29,%ymm22,%ymm8
+       vpaddq  %ymm30,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm6,%ymm6
+       vpsllq  $2,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm6,%ymm6
+
+       vpsrlq  $44,%ymm6,%ymm30
+       vpandq  %ymm28,%ymm6,%ymm6
+
+       vpaddq  %ymm30,%ymm7,%ymm7
+
+
+
+
+
+       vpunpcklqdq     %ymm5,%ymm8,%ymm26
+       vpunpckhqdq     %ymm5,%ymm8,%ymm5
+       vpunpcklqdq     %ymm3,%ymm6,%ymm24
+       vpunpckhqdq     %ymm3,%ymm6,%ymm3
+       vpunpcklqdq     %ymm4,%ymm7,%ymm25
+       vpunpckhqdq     %ymm4,%ymm7,%ymm4
+       vshufi64x2      $0x44,%zmm5,%zmm26,%zmm8
+       vshufi64x2      $0x44,%zmm3,%zmm24,%zmm6
+       vshufi64x2      $0x44,%zmm4,%zmm25,%zmm7
+
+       vmovdqu64       0(%rsi),%zmm26
+       vmovdqu64       64(%rsi),%zmm27
+       leaq    128(%rsi),%rsi
+
+       vpsllq  $2,%zmm8,%zmm10
+       vpsllq  $2,%zmm7,%zmm9
+       vpaddq  %zmm8,%zmm10,%zmm10
+       vpaddq  %zmm7,%zmm9,%zmm9
+       vpsllq  $2,%zmm10,%zmm10
+       vpsllq  $2,%zmm9,%zmm9
+
+       vpbroadcastq    %rcx,%zmm31
+       vpbroadcastq    %xmm28,%zmm28
+       vpbroadcastq    %xmm29,%zmm29
+
+       vpbroadcastq    %xmm9,%zmm16
+       vpbroadcastq    %xmm10,%zmm17
+       vpbroadcastq    %xmm6,%zmm3
+       vpbroadcastq    %xmm7,%zmm4
+       vpbroadcastq    %xmm8,%zmm5
+
+       vpunpcklqdq     %zmm27,%zmm26,%zmm25
+       vpunpckhqdq     %zmm27,%zmm26,%zmm27
+
+
+
+       vpsrlq  $24,%zmm27,%zmm26
+       vporq   %zmm31,%zmm26,%zmm26
+       vpaddq  %zmm26,%zmm2,%zmm2
+       vpandq  %zmm28,%zmm25,%zmm24
+       vpsrlq  $44,%zmm25,%zmm25
+       vpsllq  $20,%zmm27,%zmm27
+       vporq   %zmm27,%zmm25,%zmm25
+       vpandq  %zmm28,%zmm25,%zmm25
+
+       subq    $8,%rdx
+       jz      .Ltail_vpmadd52_8x
+       jmp     .Loop_vpmadd52_8x
+
+.align 32
+.Loop_vpmadd52_8x:
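+/* Same structure as the 4x loop above, with 512-bit registers. */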
+
+       vpaddq  %zmm24,%zmm0,%zmm0
+       vpaddq  %zmm25,%zmm1,%zmm1
+
+       vpxorq  %zmm18,%zmm18,%zmm18
+       vpmadd52luq     %zmm2,%zmm16,%zmm18
+       vpxorq  %zmm19,%zmm19,%zmm19
+       vpmadd52huq     %zmm2,%zmm16,%zmm19
+       vpxorq  %zmm20,%zmm20,%zmm20
+       vpmadd52luq     %zmm2,%zmm17,%zmm20
+       vpxorq  %zmm21,%zmm21,%zmm21
+       vpmadd52huq     %zmm2,%zmm17,%zmm21
+       vpxorq  %zmm22,%zmm22,%zmm22
+       vpmadd52luq     %zmm2,%zmm3,%zmm22
+       vpxorq  %zmm23,%zmm23,%zmm23
+       vpmadd52huq     %zmm2,%zmm3,%zmm23
+
+       vmovdqu64       0(%rsi),%zmm26
+       vmovdqu64       64(%rsi),%zmm27
+       leaq    128(%rsi),%rsi
+       vpmadd52luq     %zmm0,%zmm3,%zmm18
+       vpmadd52huq     %zmm0,%zmm3,%zmm19
+       vpmadd52luq     %zmm0,%zmm4,%zmm20
+       vpmadd52huq     %zmm0,%zmm4,%zmm21
+       vpmadd52luq     %zmm0,%zmm5,%zmm22
+       vpmadd52huq     %zmm0,%zmm5,%zmm23
+
+       vpunpcklqdq     %zmm27,%zmm26,%zmm25
+       vpunpckhqdq     %zmm27,%zmm26,%zmm27
+       vpmadd52luq     %zmm1,%zmm17,%zmm18
+       vpmadd52huq     %zmm1,%zmm17,%zmm19
+       vpmadd52luq     %zmm1,%zmm3,%zmm20
+       vpmadd52huq     %zmm1,%zmm3,%zmm21
+       vpmadd52luq     %zmm1,%zmm4,%zmm22
+       vpmadd52huq     %zmm1,%zmm4,%zmm23
+
+
+
+       vpsrlq  $44,%zmm18,%zmm30
+       vpsllq  $8,%zmm19,%zmm19
+       vpandq  %zmm28,%zmm18,%zmm0
+       vpaddq  %zmm30,%zmm19,%zmm19
+
+       vpsrlq  $24,%zmm27,%zmm26
+       vporq   %zmm31,%zmm26,%zmm26
+       vpaddq  %zmm19,%zmm20,%zmm20
+
+       vpsrlq  $44,%zmm20,%zmm30
+       vpsllq  $8,%zmm21,%zmm21
+       vpandq  %zmm28,%zmm20,%zmm1
+       vpaddq  %zmm30,%zmm21,%zmm21
+
+       vpandq  %zmm28,%zmm25,%zmm24
+       vpsrlq  $44,%zmm25,%zmm25
+       vpsllq  $20,%zmm27,%zmm27
+       vpaddq  %zmm21,%zmm22,%zmm22
+
+       vpsrlq  $42,%zmm22,%zmm30
+       vpsllq  $10,%zmm23,%zmm23
+       vpandq  %zmm29,%zmm22,%zmm2
+       vpaddq  %zmm30,%zmm23,%zmm23
+
+       vpaddq  %zmm26,%zmm2,%zmm2
+       vpaddq  %zmm23,%zmm0,%zmm0
+       vpsllq  $2,%zmm23,%zmm23
+
+       vpaddq  %zmm23,%zmm0,%zmm0
+       vporq   %zmm27,%zmm25,%zmm25
+       vpandq  %zmm28,%zmm25,%zmm25
+
+       vpsrlq  $44,%zmm0,%zmm30
+       vpandq  %zmm28,%zmm0,%zmm0
+
+       vpaddq  %zmm30,%zmm1,%zmm1
+
+       subq    $8,%rdx
+       jnz     .Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+
+       vpaddq  %zmm24,%zmm0,%zmm0
+       vpaddq  %zmm25,%zmm1,%zmm1
+
+       vpxorq  %zmm18,%zmm18,%zmm18
+       vpmadd52luq     %zmm2,%zmm9,%zmm18
+       vpxorq  %zmm19,%zmm19,%zmm19
+       vpmadd52huq     %zmm2,%zmm9,%zmm19
+       vpxorq  %zmm20,%zmm20,%zmm20
+       vpmadd52luq     %zmm2,%zmm10,%zmm20
+       vpxorq  %zmm21,%zmm21,%zmm21
+       vpmadd52huq     %zmm2,%zmm10,%zmm21
+       vpxorq  %zmm22,%zmm22,%zmm22
+       vpmadd52luq     %zmm2,%zmm6,%zmm22
+       vpxorq  %zmm23,%zmm23,%zmm23
+       vpmadd52huq     %zmm2,%zmm6,%zmm23
+
+       vpmadd52luq     %zmm0,%zmm6,%zmm18
+       vpmadd52huq     %zmm0,%zmm6,%zmm19
+       vpmadd52luq     %zmm0,%zmm7,%zmm20
+       vpmadd52huq     %zmm0,%zmm7,%zmm21
+       vpmadd52luq     %zmm0,%zmm8,%zmm22
+       vpmadd52huq     %zmm0,%zmm8,%zmm23
+
+       vpmadd52luq     %zmm1,%zmm10,%zmm18
+       vpmadd52huq     %zmm1,%zmm10,%zmm19
+       vpmadd52luq     %zmm1,%zmm6,%zmm20
+       vpmadd52huq     %zmm1,%zmm6,%zmm21
+       vpmadd52luq     %zmm1,%zmm7,%zmm22
+       vpmadd52huq     %zmm1,%zmm7,%zmm23
+
+
+
+
+       movl    $1,%eax
+       kmovw   %eax,%k1
+       vpsrldq $8,%zmm18,%zmm24
+       vpsrldq $8,%zmm19,%zmm0
+       vpsrldq $8,%zmm20,%zmm25
+       vpsrldq $8,%zmm21,%zmm1
+       vpaddq  %zmm24,%zmm18,%zmm18
+       vpaddq  %zmm0,%zmm19,%zmm19
+       vpsrldq $8,%zmm22,%zmm26
+       vpsrldq $8,%zmm23,%zmm2
+       vpaddq  %zmm25,%zmm20,%zmm20
+       vpaddq  %zmm1,%zmm21,%zmm21
+       vpermq  $0x2,%zmm18,%zmm24
+       vpermq  $0x2,%zmm19,%zmm0
+       vpaddq  %zmm26,%zmm22,%zmm22
+       vpaddq  %zmm2,%zmm23,%zmm23
+
+       vpermq  $0x2,%zmm20,%zmm25
+       vpermq  $0x2,%zmm21,%zmm1
+       vpaddq  %zmm24,%zmm18,%zmm18
+       vpaddq  %zmm0,%zmm19,%zmm19
+       vpermq  $0x2,%zmm22,%zmm26
+       vpermq  $0x2,%zmm23,%zmm2
+       vpaddq  %zmm25,%zmm20,%zmm20
+       vpaddq  %zmm1,%zmm21,%zmm21
+       vextracti64x4   $1,%zmm18,%ymm24
+       vextracti64x4   $1,%zmm19,%ymm0
+       vpaddq  %zmm26,%zmm22,%zmm22
+       vpaddq  %zmm2,%zmm23,%zmm23
+
+       vextracti64x4   $1,%zmm20,%ymm25
+       vextracti64x4   $1,%zmm21,%ymm1
+       vextracti64x4   $1,%zmm22,%ymm26
+       vextracti64x4   $1,%zmm23,%ymm2
+       vpaddq  %ymm24,%ymm18,%ymm18{%k1}{z}
+       vpaddq  %ymm0,%ymm19,%ymm19{%k1}{z}
+       vpaddq  %ymm25,%ymm20,%ymm20{%k1}{z}
+       vpaddq  %ymm1,%ymm21,%ymm21{%k1}{z}
+       vpaddq  %ymm26,%ymm22,%ymm22{%k1}{z}
+       vpaddq  %ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+       vpsrlq  $44,%ymm18,%ymm30
+       vpsllq  $8,%ymm19,%ymm19
+       vpandq  %ymm28,%ymm18,%ymm0
+       vpaddq  %ymm30,%ymm19,%ymm19
+
+       vpaddq  %ymm19,%ymm20,%ymm20
+
+       vpsrlq  $44,%ymm20,%ymm30
+       vpsllq  $8,%ymm21,%ymm21
+       vpandq  %ymm28,%ymm20,%ymm1
+       vpaddq  %ymm30,%ymm21,%ymm21
+
+       vpaddq  %ymm21,%ymm22,%ymm22
+
+       vpsrlq  $42,%ymm22,%ymm30
+       vpsllq  $10,%ymm23,%ymm23
+       vpandq  %ymm29,%ymm22,%ymm2
+       vpaddq  %ymm30,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+       vpsllq  $2,%ymm23,%ymm23
+
+       vpaddq  %ymm23,%ymm0,%ymm0
+
+       vpsrlq  $44,%ymm0,%ymm30
+       vpandq  %ymm28,%ymm0,%ymm0
+
+       vpaddq  %ymm30,%ymm1,%ymm1
+
+
+
+       vmovq   %xmm0,0(%rdi)
+       vmovq   %xmm1,8(%rdi)
+       vmovq   %xmm2,16(%rdi)
+       vzeroall
+
+.Lno_data_vpmadd52_8x:
+       .byte   0xf3,0xc3
+.size  poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
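+/*
+ * emit for base 2^44: fold the three limbs into a 130-bit value,
+ * perform the final reduction modulo 2^130-5 as a conditional choice
+ * between h and h+5, add the encrypted nonce from (%rdx) and store the
+ * 16-byte tag at (%rsi).
+ */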
+.type  poly1305_emit_base2_44,@function
+.align 32
+poly1305_emit_base2_44:
+       movq    0(%rdi),%r8
+       movq    8(%rdi),%r9
+       movq    16(%rdi),%r10
+
+       movq    %r9,%rax
+       shrq    $20,%r9
+       shlq    $44,%rax
+       movq    %r10,%rcx
+       shrq    $40,%r10
+       shlq    $24,%rcx
+
+       addq    %rax,%r8
+       adcq    %rcx,%r9
+       adcq    $0,%r10
+
+       movq    %r8,%rax
+       addq    $5,%r8
+       movq    %r9,%rcx
+       adcq    $0,%r9
+       adcq    $0,%r10
+       shrq    $2,%r10
+       cmovnzq %r8,%rax
+       cmovnzq %r9,%rcx
+
+       addq    0(%rdx),%rax
+       adcq    8(%rdx),%rcx
+       movq    %rax,0(%rsi)
+       movq    %rcx,8(%rsi)
+
+       .byte   0xf3,0xc3
+.size  poly1305_emit_base2_44,.-poly1305_emit_base2_44
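+/*
+ * Constant pool for the SIMD code paths: 24/26-bit limb masks and the
+ * 2^24 pad-bit constant, lane permutations for AVX2/AVX-512, and the
+ * permutation, mask and shift tables for the base 2^44 (vpmadd52) code.
+ */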
+.align 64
+.Lconst:
+.Lmask24:
+.long  0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long  16777216,0,16777216,0,16777216,0,16777216,0
+.Lmask26:
+.long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long  2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long  0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad  0,12,24,64
+.L2_44_mask:
+.quad  0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad  44,44,42,64
+.L2_44_shift_lft:
+.quad  8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.byte  80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
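+/*
+ * xor128_encrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx): XOR the
+ * input with the pre-computed one-time pad, write the result to out and
+ * keep a copy of the resulting ciphertext in the pad buffer, zero-padded
+ * to a 16-byte boundary, ready for the Poly1305 pass; the updated otp
+ * pointer is returned in %rax.
+ */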
+.globl xor128_encrypt_n_pad
+.type  xor128_encrypt_n_pad,@function
+.align 16
+xor128_encrypt_n_pad:
+       subq    %rdx,%rsi
+       subq    %rdx,%rdi
+       movq    %rcx,%r10
+       shrq    $4,%rcx
+       jz      .Ltail_enc
+       nop
+.Loop_enc_xmm:
+       movdqu  (%rsi,%rdx,1),%xmm0
+       pxor    (%rdx),%xmm0
+       movdqu  %xmm0,(%rdi,%rdx,1)
+       movdqa  %xmm0,(%rdx)
+       leaq    16(%rdx),%rdx
+       decq    %rcx
+       jnz     .Loop_enc_xmm
+
+       andq    $15,%r10
+       jz      .Ldone_enc
+
+.Ltail_enc:
+       movq    $16,%rcx
+       subq    %r10,%rcx
+       xorl    %eax,%eax
+.Loop_enc_byte:
+       movb    (%rsi,%rdx,1),%al
+       xorb    (%rdx),%al
+       movb    %al,(%rdi,%rdx,1)
+       movb    %al,(%rdx)
+       leaq    1(%rdx),%rdx
+       decq    %r10
+       jnz     .Loop_enc_byte
+
+       xorl    %eax,%eax
+.Loop_enc_pad:
+       movb    %al,(%rdx)
+       leaq    1(%rdx),%rdx
+       decq    %rcx
+       jnz     .Loop_enc_pad
+
+.Ldone_enc:
+       movq    %rdx,%rax
+       .byte   0xf3,0xc3
+.size  xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
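+/*
+ * xor128_decrypt_n_pad: as above but in the opposite direction - the
+ * incoming ciphertext is copied into the pad buffer (again zero-padded
+ * to 16 bytes) while the XOR with the pad recovers the plaintext.
+ */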
+.globl xor128_decrypt_n_pad
+.type  xor128_decrypt_n_pad,@function
+.align 16
+xor128_decrypt_n_pad:
+       subq    %rdx,%rsi
+       subq    %rdx,%rdi
+       movq    %rcx,%r10
+       shrq    $4,%rcx
+       jz      .Ltail_dec
+       nop
+.Loop_dec_xmm:
+       movdqu  (%rsi,%rdx,1),%xmm0
+       movdqa  (%rdx),%xmm1
+       pxor    %xmm0,%xmm1
+       movdqu  %xmm1,(%rdi,%rdx,1)
+       movdqa  %xmm0,(%rdx)
+       leaq    16(%rdx),%rdx
+       decq    %rcx
+       jnz     .Loop_dec_xmm
+
+       pxor    %xmm1,%xmm1
+       andq    $15,%r10
+       jz      .Ldone_dec
+
+.Ltail_dec:
+       movq    $16,%rcx
+       subq    %r10,%rcx
+       xorl    %eax,%eax
+       xorq    %r11,%r11
+.Loop_dec_byte:
+       movb    (%rsi,%rdx,1),%r11b
+       movb    (%rdx),%al
+       xorb    %r11b,%al
+       movb    %al,(%rdi,%rdx,1)
+       movb    %r11b,(%rdx)
+       leaq    1(%rdx),%rdx
+       decq    %r10
+       jnz     .Loop_dec_byte
+
+       xorl    %eax,%eax
+.Loop_dec_pad:
+       movb    %al,(%rdx)
+       leaq    1(%rdx),%rdx
+       decq    %rcx
+       jnz     .Loop_dec_pad
+
+.Ldone_dec:
+       movq    %rdx,%rax
+       .byte   0xf3,0xc3
+.size  xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-- 
2.19.1
