These x86_64 vectorized implementations come from Andy Polyakov's
implementation, and are included here in raw form without modification,
so that subsequent commits that fix these up for the kernel can see how
it has changed.

While this is CRYPTOGAMS code, the originating code for this happens to
be the same as OpenSSL's commit cded951378069a478391843f5f8653c1eb5128da

Signed-off-by: Jason A. Donenfeld <ja...@zx2c4.com>
Based-on-code-from: Andy Polyakov <ap...@openssl.org>
Cc: Andy Polyakov <ap...@openssl.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: x...@kernel.org
Cc: Samuel Neves <sne...@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumas...@gmail.com>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: kernel-harden...@lists.openwall.com
Cc: linux-crypto@vger.kernel.org
---
 .../chacha20/chacha20-x86_64-cryptogams.S     | 3433 +++++++++++++++++
 1 file changed, 3433 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-x86_64-cryptogams.S

diff --git a/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S 
b/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S
new file mode 100644
index 000000000000..2bfc76f7e01f
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S
@@ -0,0 +1,3433 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <ap...@openssl.org>. All Rights 
Reserved.
+ */
+
+.text  
+
+
+
+.align 64
+.Lzero:
+.long  0,0,0,0
+.Lone:
+.long  1,0,0,0
+.Linc:
+.long  0,1,2,3
+.Lfour:
+.long  4,4,4,4
+.Lincy:
+.long  0,2,4,6,1,3,5,7
+.Leight:
+.long  8,8,8,8,8,8,8,8
+.Lrot16:
+.byte  0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte  0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Ltwoy:
+.long  2,0,0,0, 2,0,0,0
+.align 64
+.Lzeroz:
+.long  0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long  4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long  16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.Lsigma:
+.byte  101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.byte  
67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,@function
+.align 64
+ChaCha20_ctr32:
+.cfi_startproc 
+       cmpq    $0,%rdx
+       je      .Lno_data
+       movq    OPENSSL_ia32cap_P+4(%rip),%r10
+       btq     $48,%r10
+       jc      .LChaCha20_avx512
+       testq   %r10,%r10
+       js      .LChaCha20_avx512vl
+       testl   $512,%r10d
+       jnz     .LChaCha20_ssse3
+
+       pushq   %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset    %r15,-56
+       subq    $64+24,%rsp
+.cfi_adjust_cfa_offset 64+24
+.Lctr32_body:
+
+
+       movdqu  (%rcx),%xmm1
+       movdqu  16(%rcx),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  .Lone(%rip),%xmm4
+
+
+       movdqa  %xmm1,16(%rsp)
+       movdqa  %xmm2,32(%rsp)
+       movdqa  %xmm3,48(%rsp)
+       movq    %rdx,%rbp
+       jmp     .Loop_outer
+
+.align 32
+.Loop_outer:
+       movl    $0x61707865,%eax
+       movl    $0x3320646e,%ebx
+       movl    $0x79622d32,%ecx
+       movl    $0x6b206574,%edx
+       movl    16(%rsp),%r8d
+       movl    20(%rsp),%r9d
+       movl    24(%rsp),%r10d
+       movl    28(%rsp),%r11d
+       movd    %xmm3,%r12d
+       movl    52(%rsp),%r13d
+       movl    56(%rsp),%r14d
+       movl    60(%rsp),%r15d
+
+       movq    %rbp,64+0(%rsp)
+       movl    $10,%ebp
+       movq    %rsi,64+8(%rsp)
+.byte  102,72,15,126,214
+       movq    %rdi,64+16(%rsp)
+       movq    %rsi,%rdi
+       shrq    $32,%rdi
+       jmp     .Loop
+
+.align 32
+.Loop:
+       addl    %r8d,%eax
+       xorl    %eax,%r12d
+       roll    $16,%r12d
+       addl    %r9d,%ebx
+       xorl    %ebx,%r13d
+       roll    $16,%r13d
+       addl    %r12d,%esi
+       xorl    %esi,%r8d
+       roll    $12,%r8d
+       addl    %r13d,%edi
+       xorl    %edi,%r9d
+       roll    $12,%r9d
+       addl    %r8d,%eax
+       xorl    %eax,%r12d
+       roll    $8,%r12d
+       addl    %r9d,%ebx
+       xorl    %ebx,%r13d
+       roll    $8,%r13d
+       addl    %r12d,%esi
+       xorl    %esi,%r8d
+       roll    $7,%r8d
+       addl    %r13d,%edi
+       xorl    %edi,%r9d
+       roll    $7,%r9d
+       movl    %esi,32(%rsp)
+       movl    %edi,36(%rsp)
+       movl    40(%rsp),%esi
+       movl    44(%rsp),%edi
+       addl    %r10d,%ecx
+       xorl    %ecx,%r14d
+       roll    $16,%r14d
+       addl    %r11d,%edx
+       xorl    %edx,%r15d
+       roll    $16,%r15d
+       addl    %r14d,%esi
+       xorl    %esi,%r10d
+       roll    $12,%r10d
+       addl    %r15d,%edi
+       xorl    %edi,%r11d
+       roll    $12,%r11d
+       addl    %r10d,%ecx
+       xorl    %ecx,%r14d
+       roll    $8,%r14d
+       addl    %r11d,%edx
+       xorl    %edx,%r15d
+       roll    $8,%r15d
+       addl    %r14d,%esi
+       xorl    %esi,%r10d
+       roll    $7,%r10d
+       addl    %r15d,%edi
+       xorl    %edi,%r11d
+       roll    $7,%r11d
+       addl    %r9d,%eax
+       xorl    %eax,%r15d
+       roll    $16,%r15d
+       addl    %r10d,%ebx
+       xorl    %ebx,%r12d
+       roll    $16,%r12d
+       addl    %r15d,%esi
+       xorl    %esi,%r9d
+       roll    $12,%r9d
+       addl    %r12d,%edi
+       xorl    %edi,%r10d
+       roll    $12,%r10d
+       addl    %r9d,%eax
+       xorl    %eax,%r15d
+       roll    $8,%r15d
+       addl    %r10d,%ebx
+       xorl    %ebx,%r12d
+       roll    $8,%r12d
+       addl    %r15d,%esi
+       xorl    %esi,%r9d
+       roll    $7,%r9d
+       addl    %r12d,%edi
+       xorl    %edi,%r10d
+       roll    $7,%r10d
+       movl    %esi,40(%rsp)
+       movl    %edi,44(%rsp)
+       movl    32(%rsp),%esi
+       movl    36(%rsp),%edi
+       addl    %r11d,%ecx
+       xorl    %ecx,%r13d
+       roll    $16,%r13d
+       addl    %r8d,%edx
+       xorl    %edx,%r14d
+       roll    $16,%r14d
+       addl    %r13d,%esi
+       xorl    %esi,%r11d
+       roll    $12,%r11d
+       addl    %r14d,%edi
+       xorl    %edi,%r8d
+       roll    $12,%r8d
+       addl    %r11d,%ecx
+       xorl    %ecx,%r13d
+       roll    $8,%r13d
+       addl    %r8d,%edx
+       xorl    %edx,%r14d
+       roll    $8,%r14d
+       addl    %r13d,%esi
+       xorl    %esi,%r11d
+       roll    $7,%r11d
+       addl    %r14d,%edi
+       xorl    %edi,%r8d
+       roll    $7,%r8d
+       decl    %ebp
+       jnz     .Loop
+       movl    %edi,36(%rsp)
+       movl    %esi,32(%rsp)
+       movq    64(%rsp),%rbp
+       movdqa  %xmm2,%xmm1
+       movq    64+8(%rsp),%rsi
+       paddd   %xmm4,%xmm3
+       movq    64+16(%rsp),%rdi
+
+       addl    $0x61707865,%eax
+       addl    $0x3320646e,%ebx
+       addl    $0x79622d32,%ecx
+       addl    $0x6b206574,%edx
+       addl    16(%rsp),%r8d
+       addl    20(%rsp),%r9d
+       addl    24(%rsp),%r10d
+       addl    28(%rsp),%r11d
+       addl    48(%rsp),%r12d
+       addl    52(%rsp),%r13d
+       addl    56(%rsp),%r14d
+       addl    60(%rsp),%r15d
+       paddd   32(%rsp),%xmm1
+
+       cmpq    $64,%rbp
+       jb      .Ltail
+
+       xorl    0(%rsi),%eax
+       xorl    4(%rsi),%ebx
+       xorl    8(%rsi),%ecx
+       xorl    12(%rsi),%edx
+       xorl    16(%rsi),%r8d
+       xorl    20(%rsi),%r9d
+       xorl    24(%rsi),%r10d
+       xorl    28(%rsi),%r11d
+       movdqu  32(%rsi),%xmm0
+       xorl    48(%rsi),%r12d
+       xorl    52(%rsi),%r13d
+       xorl    56(%rsi),%r14d
+       xorl    60(%rsi),%r15d
+       leaq    64(%rsi),%rsi
+       pxor    %xmm1,%xmm0
+
+       movdqa  %xmm2,32(%rsp)
+       movd    %xmm3,48(%rsp)
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       movdqu  %xmm0,32(%rdi)
+       movl    %r12d,48(%rdi)
+       movl    %r13d,52(%rdi)
+       movl    %r14d,56(%rdi)
+       movl    %r15d,60(%rdi)
+       leaq    64(%rdi),%rdi
+
+       subq    $64,%rbp
+       jnz     .Loop_outer
+
+       jmp     .Ldone
+
+.align 16
+.Ltail:
+       movl    %eax,0(%rsp)
+       movl    %ebx,4(%rsp)
+       xorq    %rbx,%rbx
+       movl    %ecx,8(%rsp)
+       movl    %edx,12(%rsp)
+       movl    %r8d,16(%rsp)
+       movl    %r9d,20(%rsp)
+       movl    %r10d,24(%rsp)
+       movl    %r11d,28(%rsp)
+       movdqa  %xmm1,32(%rsp)
+       movl    %r12d,48(%rsp)
+       movl    %r13d,52(%rsp)
+       movl    %r14d,56(%rsp)
+       movl    %r15d,60(%rsp)
+
+.Loop_tail:
+       movzbl  (%rsi,%rbx,1),%eax
+       movzbl  (%rsp,%rbx,1),%edx
+       leaq    1(%rbx),%rbx
+       xorl    %edx,%eax
+       movb    %al,-1(%rdi,%rbx,1)
+       decq    %rbp
+       jnz     .Loop_tail
+
+.Ldone:
+       leaq    64+24+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lno_data:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_ctr32,.-ChaCha20_ctr32
+.type  ChaCha20_ssse3,@function
+.align 32
+ChaCha20_ssse3:
+.cfi_startproc 
+.LChaCha20_ssse3:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       testl   $2048,%r10d
+       jnz     .LChaCha20_4xop
+       cmpq    $128,%rdx
+       je      .LChaCha20_128
+       ja      .LChaCha20_4x
+
+.Ldo_sse3_after_all:
+       subq    $64+8,%rsp
+       movdqa  .Lsigma(%rip),%xmm0
+       movdqu  (%rcx),%xmm1
+       movdqu  16(%rcx),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  .Lrot16(%rip),%xmm6
+       movdqa  .Lrot24(%rip),%xmm7
+
+       movdqa  %xmm0,0(%rsp)
+       movdqa  %xmm1,16(%rsp)
+       movdqa  %xmm2,32(%rsp)
+       movdqa  %xmm3,48(%rsp)
+       movq    $10,%r8
+       jmp     .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+       movdqa  .Lone(%rip),%xmm3
+       movdqa  0(%rsp),%xmm0
+       movdqa  16(%rsp),%xmm1
+       movdqa  32(%rsp),%xmm2
+       paddd   48(%rsp),%xmm3
+       movq    $10,%r8
+       movdqa  %xmm3,48(%rsp)
+       jmp     .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+       paddd   %xmm1,%xmm0
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,222
+       paddd   %xmm3,%xmm2
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm1,%xmm4
+       psrld   $20,%xmm1
+       pslld   $12,%xmm4
+       por     %xmm4,%xmm1
+       paddd   %xmm1,%xmm0
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,223
+       paddd   %xmm3,%xmm2
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm1,%xmm4
+       psrld   $25,%xmm1
+       pslld   $7,%xmm4
+       por     %xmm4,%xmm1
+       pshufd  $78,%xmm2,%xmm2
+       pshufd  $57,%xmm1,%xmm1
+       pshufd  $147,%xmm3,%xmm3
+       nop
+       paddd   %xmm1,%xmm0
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,222
+       paddd   %xmm3,%xmm2
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm1,%xmm4
+       psrld   $20,%xmm1
+       pslld   $12,%xmm4
+       por     %xmm4,%xmm1
+       paddd   %xmm1,%xmm0
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,223
+       paddd   %xmm3,%xmm2
+       pxor    %xmm2,%xmm1
+       movdqa  %xmm1,%xmm4
+       psrld   $25,%xmm1
+       pslld   $7,%xmm4
+       por     %xmm4,%xmm1
+       pshufd  $78,%xmm2,%xmm2
+       pshufd  $147,%xmm1,%xmm1
+       pshufd  $57,%xmm3,%xmm3
+       decq    %r8
+       jnz     .Loop_ssse3
+       paddd   0(%rsp),%xmm0
+       paddd   16(%rsp),%xmm1
+       paddd   32(%rsp),%xmm2
+       paddd   48(%rsp),%xmm3
+
+       cmpq    $64,%rdx
+       jb      .Ltail_ssse3
+
+       movdqu  0(%rsi),%xmm4
+       movdqu  16(%rsi),%xmm5
+       pxor    %xmm4,%xmm0
+       movdqu  32(%rsi),%xmm4
+       pxor    %xmm5,%xmm1
+       movdqu  48(%rsi),%xmm5
+       leaq    64(%rsi),%rsi
+       pxor    %xmm4,%xmm2
+       pxor    %xmm5,%xmm3
+
+       movdqu  %xmm0,0(%rdi)
+       movdqu  %xmm1,16(%rdi)
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm3,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       subq    $64,%rdx
+       jnz     .Loop_outer_ssse3
+
+       jmp     .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+       movdqa  %xmm0,0(%rsp)
+       movdqa  %xmm1,16(%rsp)
+       movdqa  %xmm2,32(%rsp)
+       movdqa  %xmm3,48(%rsp)
+       xorq    %r8,%r8
+
+.Loop_tail_ssse3:
+       movzbl  (%rsi,%r8,1),%eax
+       movzbl  (%rsp,%r8,1),%ecx
+       leaq    1(%r8),%r8
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r8,1)
+       decq    %rdx
+       jnz     .Loop_tail_ssse3
+
+.Ldone_ssse3:
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.Lssse3_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_ssse3,.-ChaCha20_ssse3
+.type  ChaCha20_128,@function
+.align 32
+ChaCha20_128:
+.cfi_startproc 
+.LChaCha20_128:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       subq    $64+8,%rsp
+       movdqa  .Lsigma(%rip),%xmm8
+       movdqu  (%rcx),%xmm9
+       movdqu  16(%rcx),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  .Lone(%rip),%xmm1
+       movdqa  .Lrot16(%rip),%xmm6
+       movdqa  .Lrot24(%rip),%xmm7
+
+       movdqa  %xmm8,%xmm10
+       movdqa  %xmm8,0(%rsp)
+       movdqa  %xmm9,%xmm11
+       movdqa  %xmm9,16(%rsp)
+       movdqa  %xmm2,%xmm0
+       movdqa  %xmm2,32(%rsp)
+       paddd   %xmm3,%xmm1
+       movdqa  %xmm3,48(%rsp)
+       movq    $10,%r8
+       jmp     .Loop_128
+
+.align 32
+.Loop_128:
+       paddd   %xmm9,%xmm8
+       pxor    %xmm8,%xmm3
+       paddd   %xmm11,%xmm10
+       pxor    %xmm10,%xmm1
+.byte  102,15,56,0,222
+.byte  102,15,56,0,206
+       paddd   %xmm3,%xmm2
+       paddd   %xmm1,%xmm0
+       pxor    %xmm2,%xmm9
+       pxor    %xmm0,%xmm11
+       movdqa  %xmm9,%xmm4
+       psrld   $20,%xmm9
+       movdqa  %xmm11,%xmm5
+       pslld   $12,%xmm4
+       psrld   $20,%xmm11
+       por     %xmm4,%xmm9
+       pslld   $12,%xmm5
+       por     %xmm5,%xmm11
+       paddd   %xmm9,%xmm8
+       pxor    %xmm8,%xmm3
+       paddd   %xmm11,%xmm10
+       pxor    %xmm10,%xmm1
+.byte  102,15,56,0,223
+.byte  102,15,56,0,207
+       paddd   %xmm3,%xmm2
+       paddd   %xmm1,%xmm0
+       pxor    %xmm2,%xmm9
+       pxor    %xmm0,%xmm11
+       movdqa  %xmm9,%xmm4
+       psrld   $25,%xmm9
+       movdqa  %xmm11,%xmm5
+       pslld   $7,%xmm4
+       psrld   $25,%xmm11
+       por     %xmm4,%xmm9
+       pslld   $7,%xmm5
+       por     %xmm5,%xmm11
+       pshufd  $78,%xmm2,%xmm2
+       pshufd  $57,%xmm9,%xmm9
+       pshufd  $147,%xmm3,%xmm3
+       pshufd  $78,%xmm0,%xmm0
+       pshufd  $57,%xmm11,%xmm11
+       pshufd  $147,%xmm1,%xmm1
+       paddd   %xmm9,%xmm8
+       pxor    %xmm8,%xmm3
+       paddd   %xmm11,%xmm10
+       pxor    %xmm10,%xmm1
+.byte  102,15,56,0,222
+.byte  102,15,56,0,206
+       paddd   %xmm3,%xmm2
+       paddd   %xmm1,%xmm0
+       pxor    %xmm2,%xmm9
+       pxor    %xmm0,%xmm11
+       movdqa  %xmm9,%xmm4
+       psrld   $20,%xmm9
+       movdqa  %xmm11,%xmm5
+       pslld   $12,%xmm4
+       psrld   $20,%xmm11
+       por     %xmm4,%xmm9
+       pslld   $12,%xmm5
+       por     %xmm5,%xmm11
+       paddd   %xmm9,%xmm8
+       pxor    %xmm8,%xmm3
+       paddd   %xmm11,%xmm10
+       pxor    %xmm10,%xmm1
+.byte  102,15,56,0,223
+.byte  102,15,56,0,207
+       paddd   %xmm3,%xmm2
+       paddd   %xmm1,%xmm0
+       pxor    %xmm2,%xmm9
+       pxor    %xmm0,%xmm11
+       movdqa  %xmm9,%xmm4
+       psrld   $25,%xmm9
+       movdqa  %xmm11,%xmm5
+       pslld   $7,%xmm4
+       psrld   $25,%xmm11
+       por     %xmm4,%xmm9
+       pslld   $7,%xmm5
+       por     %xmm5,%xmm11
+       pshufd  $78,%xmm2,%xmm2
+       pshufd  $147,%xmm9,%xmm9
+       pshufd  $57,%xmm3,%xmm3
+       pshufd  $78,%xmm0,%xmm0
+       pshufd  $147,%xmm11,%xmm11
+       pshufd  $57,%xmm1,%xmm1
+       decq    %r8
+       jnz     .Loop_128
+       paddd   0(%rsp),%xmm8
+       paddd   16(%rsp),%xmm9
+       paddd   32(%rsp),%xmm2
+       paddd   48(%rsp),%xmm3
+       paddd   .Lone(%rip),%xmm1
+       paddd   0(%rsp),%xmm10
+       paddd   16(%rsp),%xmm11
+       paddd   32(%rsp),%xmm0
+       paddd   48(%rsp),%xmm1
+
+       movdqu  0(%rsi),%xmm4
+       movdqu  16(%rsi),%xmm5
+       pxor    %xmm4,%xmm8
+       movdqu  32(%rsi),%xmm4
+       pxor    %xmm5,%xmm9
+       movdqu  48(%rsi),%xmm5
+       pxor    %xmm4,%xmm2
+       movdqu  64(%rsi),%xmm4
+       pxor    %xmm5,%xmm3
+       movdqu  80(%rsi),%xmm5
+       pxor    %xmm4,%xmm10
+       movdqu  96(%rsi),%xmm4
+       pxor    %xmm5,%xmm11
+       movdqu  112(%rsi),%xmm5
+       pxor    %xmm4,%xmm0
+       pxor    %xmm5,%xmm1
+
+       movdqu  %xmm8,0(%rdi)
+       movdqu  %xmm9,16(%rdi)
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm3,48(%rdi)
+       movdqu  %xmm10,64(%rdi)
+       movdqu  %xmm11,80(%rdi)
+       movdqu  %xmm0,96(%rdi)
+       movdqu  %xmm1,112(%rdi)
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L128_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_128,.-ChaCha20_128
+.type  ChaCha20_4x,@function
+.align 32
+ChaCha20_4x:
+.cfi_startproc 
+.LChaCha20_4x:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       movq    %r10,%r11
+       shrq    $32,%r10
+       testq   $32,%r10
+       jnz     .LChaCha20_8x
+       cmpq    $192,%rdx
+       ja      .Lproceed4x
+
+       andq    $71303168,%r11
+       cmpq    $4194304,%r11
+       je      .Ldo_sse3_after_all
+
+.Lproceed4x:
+       subq    $0x140+8,%rsp
+       movdqa  .Lsigma(%rip),%xmm11
+       movdqu  (%rcx),%xmm15
+       movdqu  16(%rcx),%xmm7
+       movdqu  (%r8),%xmm3
+       leaq    256(%rsp),%rcx
+       leaq    .Lrot16(%rip),%r10
+       leaq    .Lrot24(%rip),%r11
+
+       pshufd  $0x00,%xmm11,%xmm8
+       pshufd  $0x55,%xmm11,%xmm9
+       movdqa  %xmm8,64(%rsp)
+       pshufd  $0xaa,%xmm11,%xmm10
+       movdqa  %xmm9,80(%rsp)
+       pshufd  $0xff,%xmm11,%xmm11
+       movdqa  %xmm10,96(%rsp)
+       movdqa  %xmm11,112(%rsp)
+
+       pshufd  $0x00,%xmm15,%xmm12
+       pshufd  $0x55,%xmm15,%xmm13
+       movdqa  %xmm12,128-256(%rcx)
+       pshufd  $0xaa,%xmm15,%xmm14
+       movdqa  %xmm13,144-256(%rcx)
+       pshufd  $0xff,%xmm15,%xmm15
+       movdqa  %xmm14,160-256(%rcx)
+       movdqa  %xmm15,176-256(%rcx)
+
+       pshufd  $0x00,%xmm7,%xmm4
+       pshufd  $0x55,%xmm7,%xmm5
+       movdqa  %xmm4,192-256(%rcx)
+       pshufd  $0xaa,%xmm7,%xmm6
+       movdqa  %xmm5,208-256(%rcx)
+       pshufd  $0xff,%xmm7,%xmm7
+       movdqa  %xmm6,224-256(%rcx)
+       movdqa  %xmm7,240-256(%rcx)
+
+       pshufd  $0x00,%xmm3,%xmm0
+       pshufd  $0x55,%xmm3,%xmm1
+       paddd   .Linc(%rip),%xmm0
+       pshufd  $0xaa,%xmm3,%xmm2
+       movdqa  %xmm1,272-256(%rcx)
+       pshufd  $0xff,%xmm3,%xmm3
+       movdqa  %xmm2,288-256(%rcx)
+       movdqa  %xmm3,304-256(%rcx)
+
+       jmp     .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+       movdqa  64(%rsp),%xmm8
+       movdqa  80(%rsp),%xmm9
+       movdqa  96(%rsp),%xmm10
+       movdqa  112(%rsp),%xmm11
+       movdqa  128-256(%rcx),%xmm12
+       movdqa  144-256(%rcx),%xmm13
+       movdqa  160-256(%rcx),%xmm14
+       movdqa  176-256(%rcx),%xmm15
+       movdqa  192-256(%rcx),%xmm4
+       movdqa  208-256(%rcx),%xmm5
+       movdqa  224-256(%rcx),%xmm6
+       movdqa  240-256(%rcx),%xmm7
+       movdqa  256-256(%rcx),%xmm0
+       movdqa  272-256(%rcx),%xmm1
+       movdqa  288-256(%rcx),%xmm2
+       movdqa  304-256(%rcx),%xmm3
+       paddd   .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+       movdqa  %xmm6,32(%rsp)
+       movdqa  %xmm7,48(%rsp)
+       movdqa  (%r10),%xmm7
+       movl    $10,%eax
+       movdqa  %xmm0,256-256(%rcx)
+       jmp     .Loop4x
+
+.align 32
+.Loop4x:
+       paddd   %xmm12,%xmm8
+       paddd   %xmm13,%xmm9
+       pxor    %xmm8,%xmm0
+       pxor    %xmm9,%xmm1
+.byte  102,15,56,0,199
+.byte  102,15,56,0,207
+       paddd   %xmm0,%xmm4
+       paddd   %xmm1,%xmm5
+       pxor    %xmm4,%xmm12
+       pxor    %xmm5,%xmm13
+       movdqa  %xmm12,%xmm6
+       pslld   $12,%xmm12
+       psrld   $20,%xmm6
+       movdqa  %xmm13,%xmm7
+       pslld   $12,%xmm13
+       por     %xmm6,%xmm12
+       psrld   $20,%xmm7
+       movdqa  (%r11),%xmm6
+       por     %xmm7,%xmm13
+       paddd   %xmm12,%xmm8
+       paddd   %xmm13,%xmm9
+       pxor    %xmm8,%xmm0
+       pxor    %xmm9,%xmm1
+.byte  102,15,56,0,198
+.byte  102,15,56,0,206
+       paddd   %xmm0,%xmm4
+       paddd   %xmm1,%xmm5
+       pxor    %xmm4,%xmm12
+       pxor    %xmm5,%xmm13
+       movdqa  %xmm12,%xmm7
+       pslld   $7,%xmm12
+       psrld   $25,%xmm7
+       movdqa  %xmm13,%xmm6
+       pslld   $7,%xmm13
+       por     %xmm7,%xmm12
+       psrld   $25,%xmm6
+       movdqa  (%r10),%xmm7
+       por     %xmm6,%xmm13
+       movdqa  %xmm4,0(%rsp)
+       movdqa  %xmm5,16(%rsp)
+       movdqa  32(%rsp),%xmm4
+       movdqa  48(%rsp),%xmm5
+       paddd   %xmm14,%xmm10
+       paddd   %xmm15,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm3
+.byte  102,15,56,0,215
+.byte  102,15,56,0,223
+       paddd   %xmm2,%xmm4
+       paddd   %xmm3,%xmm5
+       pxor    %xmm4,%xmm14
+       pxor    %xmm5,%xmm15
+       movdqa  %xmm14,%xmm6
+       pslld   $12,%xmm14
+       psrld   $20,%xmm6
+       movdqa  %xmm15,%xmm7
+       pslld   $12,%xmm15
+       por     %xmm6,%xmm14
+       psrld   $20,%xmm7
+       movdqa  (%r11),%xmm6
+       por     %xmm7,%xmm15
+       paddd   %xmm14,%xmm10
+       paddd   %xmm15,%xmm11
+       pxor    %xmm10,%xmm2
+       pxor    %xmm11,%xmm3
+.byte  102,15,56,0,214
+.byte  102,15,56,0,222
+       paddd   %xmm2,%xmm4
+       paddd   %xmm3,%xmm5
+       pxor    %xmm4,%xmm14
+       pxor    %xmm5,%xmm15
+       movdqa  %xmm14,%xmm7
+       pslld   $7,%xmm14
+       psrld   $25,%xmm7
+       movdqa  %xmm15,%xmm6
+       pslld   $7,%xmm15
+       por     %xmm7,%xmm14
+       psrld   $25,%xmm6
+       movdqa  (%r10),%xmm7
+       por     %xmm6,%xmm15
+       paddd   %xmm13,%xmm8
+       paddd   %xmm14,%xmm9
+       pxor    %xmm8,%xmm3
+       pxor    %xmm9,%xmm0
+.byte  102,15,56,0,223
+.byte  102,15,56,0,199
+       paddd   %xmm3,%xmm4
+       paddd   %xmm0,%xmm5
+       pxor    %xmm4,%xmm13
+       pxor    %xmm5,%xmm14
+       movdqa  %xmm13,%xmm6
+       pslld   $12,%xmm13
+       psrld   $20,%xmm6
+       movdqa  %xmm14,%xmm7
+       pslld   $12,%xmm14
+       por     %xmm6,%xmm13
+       psrld   $20,%xmm7
+       movdqa  (%r11),%xmm6
+       por     %xmm7,%xmm14
+       paddd   %xmm13,%xmm8
+       paddd   %xmm14,%xmm9
+       pxor    %xmm8,%xmm3
+       pxor    %xmm9,%xmm0
+.byte  102,15,56,0,222
+.byte  102,15,56,0,198
+       paddd   %xmm3,%xmm4
+       paddd   %xmm0,%xmm5
+       pxor    %xmm4,%xmm13
+       pxor    %xmm5,%xmm14
+       movdqa  %xmm13,%xmm7
+       pslld   $7,%xmm13
+       psrld   $25,%xmm7
+       movdqa  %xmm14,%xmm6
+       pslld   $7,%xmm14
+       por     %xmm7,%xmm13
+       psrld   $25,%xmm6
+       movdqa  (%r10),%xmm7
+       por     %xmm6,%xmm14
+       movdqa  %xmm4,32(%rsp)
+       movdqa  %xmm5,48(%rsp)
+       movdqa  0(%rsp),%xmm4
+       movdqa  16(%rsp),%xmm5
+       paddd   %xmm15,%xmm10
+       paddd   %xmm12,%xmm11
+       pxor    %xmm10,%xmm1
+       pxor    %xmm11,%xmm2
+.byte  102,15,56,0,207
+.byte  102,15,56,0,215
+       paddd   %xmm1,%xmm4
+       paddd   %xmm2,%xmm5
+       pxor    %xmm4,%xmm15
+       pxor    %xmm5,%xmm12
+       movdqa  %xmm15,%xmm6
+       pslld   $12,%xmm15
+       psrld   $20,%xmm6
+       movdqa  %xmm12,%xmm7
+       pslld   $12,%xmm12
+       por     %xmm6,%xmm15
+       psrld   $20,%xmm7
+       movdqa  (%r11),%xmm6
+       por     %xmm7,%xmm12
+       paddd   %xmm15,%xmm10
+       paddd   %xmm12,%xmm11
+       pxor    %xmm10,%xmm1
+       pxor    %xmm11,%xmm2
+.byte  102,15,56,0,206
+.byte  102,15,56,0,214
+       paddd   %xmm1,%xmm4
+       paddd   %xmm2,%xmm5
+       pxor    %xmm4,%xmm15
+       pxor    %xmm5,%xmm12
+       movdqa  %xmm15,%xmm7
+       pslld   $7,%xmm15
+       psrld   $25,%xmm7
+       movdqa  %xmm12,%xmm6
+       pslld   $7,%xmm12
+       por     %xmm7,%xmm15
+       psrld   $25,%xmm6
+       movdqa  (%r10),%xmm7
+       por     %xmm6,%xmm12
+       decl    %eax
+       jnz     .Loop4x
+
+       paddd   64(%rsp),%xmm8
+       paddd   80(%rsp),%xmm9
+       paddd   96(%rsp),%xmm10
+       paddd   112(%rsp),%xmm11
+
+       movdqa  %xmm8,%xmm6
+       punpckldq       %xmm9,%xmm8
+       movdqa  %xmm10,%xmm7
+       punpckldq       %xmm11,%xmm10
+       punpckhdq       %xmm9,%xmm6
+       punpckhdq       %xmm11,%xmm7
+       movdqa  %xmm8,%xmm9
+       punpcklqdq      %xmm10,%xmm8
+       movdqa  %xmm6,%xmm11
+       punpcklqdq      %xmm7,%xmm6
+       punpckhqdq      %xmm10,%xmm9
+       punpckhqdq      %xmm7,%xmm11
+       paddd   128-256(%rcx),%xmm12
+       paddd   144-256(%rcx),%xmm13
+       paddd   160-256(%rcx),%xmm14
+       paddd   176-256(%rcx),%xmm15
+
+       movdqa  %xmm8,0(%rsp)
+       movdqa  %xmm9,16(%rsp)
+       movdqa  32(%rsp),%xmm8
+       movdqa  48(%rsp),%xmm9
+
+       movdqa  %xmm12,%xmm10
+       punpckldq       %xmm13,%xmm12
+       movdqa  %xmm14,%xmm7
+       punpckldq       %xmm15,%xmm14
+       punpckhdq       %xmm13,%xmm10
+       punpckhdq       %xmm15,%xmm7
+       movdqa  %xmm12,%xmm13
+       punpcklqdq      %xmm14,%xmm12
+       movdqa  %xmm10,%xmm15
+       punpcklqdq      %xmm7,%xmm10
+       punpckhqdq      %xmm14,%xmm13
+       punpckhqdq      %xmm7,%xmm15
+       paddd   192-256(%rcx),%xmm4
+       paddd   208-256(%rcx),%xmm5
+       paddd   224-256(%rcx),%xmm8
+       paddd   240-256(%rcx),%xmm9
+
+       movdqa  %xmm6,32(%rsp)
+       movdqa  %xmm11,48(%rsp)
+
+       movdqa  %xmm4,%xmm14
+       punpckldq       %xmm5,%xmm4
+       movdqa  %xmm8,%xmm7
+       punpckldq       %xmm9,%xmm8
+       punpckhdq       %xmm5,%xmm14
+       punpckhdq       %xmm9,%xmm7
+       movdqa  %xmm4,%xmm5
+       punpcklqdq      %xmm8,%xmm4
+       movdqa  %xmm14,%xmm9
+       punpcklqdq      %xmm7,%xmm14
+       punpckhqdq      %xmm8,%xmm5
+       punpckhqdq      %xmm7,%xmm9
+       paddd   256-256(%rcx),%xmm0
+       paddd   272-256(%rcx),%xmm1
+       paddd   288-256(%rcx),%xmm2
+       paddd   304-256(%rcx),%xmm3
+
+       movdqa  %xmm0,%xmm8
+       punpckldq       %xmm1,%xmm0
+       movdqa  %xmm2,%xmm7
+       punpckldq       %xmm3,%xmm2
+       punpckhdq       %xmm1,%xmm8
+       punpckhdq       %xmm3,%xmm7
+       movdqa  %xmm0,%xmm1
+       punpcklqdq      %xmm2,%xmm0
+       movdqa  %xmm8,%xmm3
+       punpcklqdq      %xmm7,%xmm8
+       punpckhqdq      %xmm2,%xmm1
+       punpckhqdq      %xmm7,%xmm3
+       cmpq    $256,%rdx
+       jb      .Ltail4x
+
+       movdqu  0(%rsi),%xmm6
+       movdqu  16(%rsi),%xmm11
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm7
+       pxor    0(%rsp),%xmm6
+       pxor    %xmm12,%xmm11
+       pxor    %xmm4,%xmm2
+       pxor    %xmm0,%xmm7
+
+       movdqu  %xmm6,0(%rdi)
+       movdqu  64(%rsi),%xmm6
+       movdqu  %xmm11,16(%rdi)
+       movdqu  80(%rsi),%xmm11
+       movdqu  %xmm2,32(%rdi)
+       movdqu  96(%rsi),%xmm2
+       movdqu  %xmm7,48(%rdi)
+       movdqu  112(%rsi),%xmm7
+       leaq    128(%rsi),%rsi
+       pxor    16(%rsp),%xmm6
+       pxor    %xmm13,%xmm11
+       pxor    %xmm5,%xmm2
+       pxor    %xmm1,%xmm7
+
+       movdqu  %xmm6,64(%rdi)
+       movdqu  0(%rsi),%xmm6
+       movdqu  %xmm11,80(%rdi)
+       movdqu  16(%rsi),%xmm11
+       movdqu  %xmm2,96(%rdi)
+       movdqu  32(%rsi),%xmm2
+       movdqu  %xmm7,112(%rdi)
+       leaq    128(%rdi),%rdi
+       movdqu  48(%rsi),%xmm7
+       pxor    32(%rsp),%xmm6
+       pxor    %xmm10,%xmm11
+       pxor    %xmm14,%xmm2
+       pxor    %xmm8,%xmm7
+
+       movdqu  %xmm6,0(%rdi)
+       movdqu  64(%rsi),%xmm6
+       movdqu  %xmm11,16(%rdi)
+       movdqu  80(%rsi),%xmm11
+       movdqu  %xmm2,32(%rdi)
+       movdqu  96(%rsi),%xmm2
+       movdqu  %xmm7,48(%rdi)
+       movdqu  112(%rsi),%xmm7
+       leaq    128(%rsi),%rsi
+       pxor    48(%rsp),%xmm6
+       pxor    %xmm15,%xmm11
+       pxor    %xmm9,%xmm2
+       pxor    %xmm3,%xmm7
+       movdqu  %xmm6,64(%rdi)
+       movdqu  %xmm11,80(%rdi)
+       movdqu  %xmm2,96(%rdi)
+       movdqu  %xmm7,112(%rdi)
+       leaq    128(%rdi),%rdi
+
+       subq    $256,%rdx
+       jnz     .Loop_outer4x
+
+       jmp     .Ldone4x
+
+.Ltail4x:
+       cmpq    $192,%rdx
+       jae     .L192_or_more4x
+       cmpq    $128,%rdx
+       jae     .L128_or_more4x
+       cmpq    $64,%rdx
+       jae     .L64_or_more4x
+
+
+       xorq    %r10,%r10
+
+       movdqa  %xmm12,16(%rsp)
+       movdqa  %xmm4,32(%rsp)
+       movdqa  %xmm0,48(%rsp)
+       jmp     .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+       movdqu  0(%rsi),%xmm6
+       movdqu  16(%rsi),%xmm11
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm7
+       pxor    0(%rsp),%xmm6
+       pxor    %xmm12,%xmm11
+       pxor    %xmm4,%xmm2
+       pxor    %xmm0,%xmm7
+       movdqu  %xmm6,0(%rdi)
+       movdqu  %xmm11,16(%rdi)
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm7,48(%rdi)
+       je      .Ldone4x
+
+       movdqa  16(%rsp),%xmm6
+       leaq    64(%rsi),%rsi
+       xorq    %r10,%r10
+       movdqa  %xmm6,0(%rsp)
+       movdqa  %xmm13,16(%rsp)
+       leaq    64(%rdi),%rdi
+       movdqa  %xmm5,32(%rsp)
+       subq    $64,%rdx
+       movdqa  %xmm1,48(%rsp)
+       jmp     .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+       movdqu  0(%rsi),%xmm6
+       movdqu  16(%rsi),%xmm11
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm7
+       pxor    0(%rsp),%xmm6
+       pxor    %xmm12,%xmm11
+       pxor    %xmm4,%xmm2
+       pxor    %xmm0,%xmm7
+
+       movdqu  %xmm6,0(%rdi)
+       movdqu  64(%rsi),%xmm6
+       movdqu  %xmm11,16(%rdi)
+       movdqu  80(%rsi),%xmm11
+       movdqu  %xmm2,32(%rdi)
+       movdqu  96(%rsi),%xmm2
+       movdqu  %xmm7,48(%rdi)
+       movdqu  112(%rsi),%xmm7
+       pxor    16(%rsp),%xmm6
+       pxor    %xmm13,%xmm11
+       pxor    %xmm5,%xmm2
+       pxor    %xmm1,%xmm7
+       movdqu  %xmm6,64(%rdi)
+       movdqu  %xmm11,80(%rdi)
+       movdqu  %xmm2,96(%rdi)
+       movdqu  %xmm7,112(%rdi)
+       je      .Ldone4x
+
+       movdqa  32(%rsp),%xmm6
+       leaq    128(%rsi),%rsi
+       xorq    %r10,%r10
+       movdqa  %xmm6,0(%rsp)
+       movdqa  %xmm10,16(%rsp)
+       leaq    128(%rdi),%rdi
+       movdqa  %xmm14,32(%rsp)
+       subq    $128,%rdx
+       movdqa  %xmm8,48(%rsp)
+       jmp     .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+       movdqu  0(%rsi),%xmm6
+       movdqu  16(%rsi),%xmm11
+       movdqu  32(%rsi),%xmm2
+       movdqu  48(%rsi),%xmm7
+       pxor    0(%rsp),%xmm6
+       pxor    %xmm12,%xmm11
+       pxor    %xmm4,%xmm2
+       pxor    %xmm0,%xmm7
+
+       movdqu  %xmm6,0(%rdi)
+       movdqu  64(%rsi),%xmm6
+       movdqu  %xmm11,16(%rdi)
+       movdqu  80(%rsi),%xmm11
+       movdqu  %xmm2,32(%rdi)
+       movdqu  96(%rsi),%xmm2
+       movdqu  %xmm7,48(%rdi)
+       movdqu  112(%rsi),%xmm7
+       leaq    128(%rsi),%rsi
+       pxor    16(%rsp),%xmm6
+       pxor    %xmm13,%xmm11
+       pxor    %xmm5,%xmm2
+       pxor    %xmm1,%xmm7
+
+       movdqu  %xmm6,64(%rdi)
+       movdqu  0(%rsi),%xmm6
+       movdqu  %xmm11,80(%rdi)
+       movdqu  16(%rsi),%xmm11
+       movdqu  %xmm2,96(%rdi)
+       movdqu  32(%rsi),%xmm2
+       movdqu  %xmm7,112(%rdi)
+       leaq    128(%rdi),%rdi
+       movdqu  48(%rsi),%xmm7
+       pxor    32(%rsp),%xmm6
+       pxor    %xmm10,%xmm11
+       pxor    %xmm14,%xmm2
+       pxor    %xmm8,%xmm7
+       movdqu  %xmm6,0(%rdi)
+       movdqu  %xmm11,16(%rdi)
+       movdqu  %xmm2,32(%rdi)
+       movdqu  %xmm7,48(%rdi)
+       je      .Ldone4x
+
+       movdqa  48(%rsp),%xmm6
+       leaq    64(%rsi),%rsi
+       xorq    %r10,%r10
+       movdqa  %xmm6,0(%rsp)
+       movdqa  %xmm15,16(%rsp)
+       leaq    64(%rdi),%rdi
+       movdqa  %xmm9,32(%rsp)
+       subq    $192,%rdx
+       movdqa  %xmm3,48(%rsp)
+
+.Loop_tail4x:
+       movzbl  (%rsi,%r10,1),%eax
+       movzbl  (%rsp,%r10,1),%ecx
+       leaq    1(%r10),%r10
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r10,1)
+       decq    %rdx
+       jnz     .Loop_tail4x
+
+.Ldone4x:
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L4x_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_4x,.-ChaCha20_4x
+.type  ChaCha20_4xop,@function
+.align 32
+ChaCha20_4xop:
+.cfi_startproc 
+.LChaCha20_4xop:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       subq    $0x140+8,%rsp
+       vzeroupper
+
+       vmovdqa .Lsigma(%rip),%xmm11
+       vmovdqu (%rcx),%xmm3
+       vmovdqu 16(%rcx),%xmm15
+       vmovdqu (%r8),%xmm7
+       leaq    256(%rsp),%rcx
+
+       vpshufd $0x00,%xmm11,%xmm8
+       vpshufd $0x55,%xmm11,%xmm9
+       vmovdqa %xmm8,64(%rsp)
+       vpshufd $0xaa,%xmm11,%xmm10
+       vmovdqa %xmm9,80(%rsp)
+       vpshufd $0xff,%xmm11,%xmm11
+       vmovdqa %xmm10,96(%rsp)
+       vmovdqa %xmm11,112(%rsp)
+
+       vpshufd $0x00,%xmm3,%xmm0
+       vpshufd $0x55,%xmm3,%xmm1
+       vmovdqa %xmm0,128-256(%rcx)
+       vpshufd $0xaa,%xmm3,%xmm2
+       vmovdqa %xmm1,144-256(%rcx)
+       vpshufd $0xff,%xmm3,%xmm3
+       vmovdqa %xmm2,160-256(%rcx)
+       vmovdqa %xmm3,176-256(%rcx)
+
+       vpshufd $0x00,%xmm15,%xmm12
+       vpshufd $0x55,%xmm15,%xmm13
+       vmovdqa %xmm12,192-256(%rcx)
+       vpshufd $0xaa,%xmm15,%xmm14
+       vmovdqa %xmm13,208-256(%rcx)
+       vpshufd $0xff,%xmm15,%xmm15
+       vmovdqa %xmm14,224-256(%rcx)
+       vmovdqa %xmm15,240-256(%rcx)
+
+       vpshufd $0x00,%xmm7,%xmm4
+       vpshufd $0x55,%xmm7,%xmm5
+       vpaddd  .Linc(%rip),%xmm4,%xmm4
+       vpshufd $0xaa,%xmm7,%xmm6
+       vmovdqa %xmm5,272-256(%rcx)
+       vpshufd $0xff,%xmm7,%xmm7
+       vmovdqa %xmm6,288-256(%rcx)
+       vmovdqa %xmm7,304-256(%rcx)
+
+       jmp     .Loop_enter4xop
+
+.align 32
+.Loop_outer4xop:
+       vmovdqa 64(%rsp),%xmm8
+       vmovdqa 80(%rsp),%xmm9
+       vmovdqa 96(%rsp),%xmm10
+       vmovdqa 112(%rsp),%xmm11
+       vmovdqa 128-256(%rcx),%xmm0
+       vmovdqa 144-256(%rcx),%xmm1
+       vmovdqa 160-256(%rcx),%xmm2
+       vmovdqa 176-256(%rcx),%xmm3
+       vmovdqa 192-256(%rcx),%xmm12
+       vmovdqa 208-256(%rcx),%xmm13
+       vmovdqa 224-256(%rcx),%xmm14
+       vmovdqa 240-256(%rcx),%xmm15
+       vmovdqa 256-256(%rcx),%xmm4
+       vmovdqa 272-256(%rcx),%xmm5
+       vmovdqa 288-256(%rcx),%xmm6
+       vmovdqa 304-256(%rcx),%xmm7
+       vpaddd  .Lfour(%rip),%xmm4,%xmm4
+
+.Loop_enter4xop:
+       movl    $10,%eax
+       vmovdqa %xmm4,256-256(%rcx)
+       jmp     .Loop4xop
+
+.align 32
+.Loop4xop:
+       vpaddd  %xmm0,%xmm8,%xmm8
+       vpaddd  %xmm1,%xmm9,%xmm9
+       vpaddd  %xmm2,%xmm10,%xmm10
+       vpaddd  %xmm3,%xmm11,%xmm11
+       vpxor   %xmm4,%xmm8,%xmm4
+       vpxor   %xmm5,%xmm9,%xmm5
+       vpxor   %xmm6,%xmm10,%xmm6
+       vpxor   %xmm7,%xmm11,%xmm7
+.byte  143,232,120,194,228,16
+.byte  143,232,120,194,237,16
+.byte  143,232,120,194,246,16
+.byte  143,232,120,194,255,16
+       vpaddd  %xmm4,%xmm12,%xmm12
+       vpaddd  %xmm5,%xmm13,%xmm13
+       vpaddd  %xmm6,%xmm14,%xmm14
+       vpaddd  %xmm7,%xmm15,%xmm15
+       vpxor   %xmm0,%xmm12,%xmm0
+       vpxor   %xmm1,%xmm13,%xmm1
+       vpxor   %xmm14,%xmm2,%xmm2
+       vpxor   %xmm15,%xmm3,%xmm3
+.byte  143,232,120,194,192,12
+.byte  143,232,120,194,201,12
+.byte  143,232,120,194,210,12
+.byte  143,232,120,194,219,12
+       vpaddd  %xmm8,%xmm0,%xmm8
+       vpaddd  %xmm9,%xmm1,%xmm9
+       vpaddd  %xmm2,%xmm10,%xmm10
+       vpaddd  %xmm3,%xmm11,%xmm11
+       vpxor   %xmm4,%xmm8,%xmm4
+       vpxor   %xmm5,%xmm9,%xmm5
+       vpxor   %xmm6,%xmm10,%xmm6
+       vpxor   %xmm7,%xmm11,%xmm7
+.byte  143,232,120,194,228,8
+.byte  143,232,120,194,237,8
+.byte  143,232,120,194,246,8
+.byte  143,232,120,194,255,8
+       vpaddd  %xmm4,%xmm12,%xmm12
+       vpaddd  %xmm5,%xmm13,%xmm13
+       vpaddd  %xmm6,%xmm14,%xmm14
+       vpaddd  %xmm7,%xmm15,%xmm15
+       vpxor   %xmm0,%xmm12,%xmm0
+       vpxor   %xmm1,%xmm13,%xmm1
+       vpxor   %xmm14,%xmm2,%xmm2
+       vpxor   %xmm15,%xmm3,%xmm3
+.byte  143,232,120,194,192,7
+.byte  143,232,120,194,201,7
+.byte  143,232,120,194,210,7
+.byte  143,232,120,194,219,7
+       vpaddd  %xmm1,%xmm8,%xmm8
+       vpaddd  %xmm2,%xmm9,%xmm9
+       vpaddd  %xmm3,%xmm10,%xmm10
+       vpaddd  %xmm0,%xmm11,%xmm11
+       vpxor   %xmm7,%xmm8,%xmm7
+       vpxor   %xmm4,%xmm9,%xmm4
+       vpxor   %xmm5,%xmm10,%xmm5
+       vpxor   %xmm6,%xmm11,%xmm6
+.byte  143,232,120,194,255,16
+.byte  143,232,120,194,228,16
+.byte  143,232,120,194,237,16
+.byte  143,232,120,194,246,16
+       vpaddd  %xmm7,%xmm14,%xmm14
+       vpaddd  %xmm4,%xmm15,%xmm15
+       vpaddd  %xmm5,%xmm12,%xmm12
+       vpaddd  %xmm6,%xmm13,%xmm13
+       vpxor   %xmm1,%xmm14,%xmm1
+       vpxor   %xmm2,%xmm15,%xmm2
+       vpxor   %xmm12,%xmm3,%xmm3
+       vpxor   %xmm13,%xmm0,%xmm0
+.byte  143,232,120,194,201,12
+.byte  143,232,120,194,210,12
+.byte  143,232,120,194,219,12
+.byte  143,232,120,194,192,12
+       vpaddd  %xmm8,%xmm1,%xmm8
+       vpaddd  %xmm9,%xmm2,%xmm9
+       vpaddd  %xmm3,%xmm10,%xmm10
+       vpaddd  %xmm0,%xmm11,%xmm11
+       vpxor   %xmm7,%xmm8,%xmm7
+       vpxor   %xmm4,%xmm9,%xmm4
+       vpxor   %xmm5,%xmm10,%xmm5
+       vpxor   %xmm6,%xmm11,%xmm6
+.byte  143,232,120,194,255,8
+.byte  143,232,120,194,228,8
+.byte  143,232,120,194,237,8
+.byte  143,232,120,194,246,8
+       vpaddd  %xmm7,%xmm14,%xmm14
+       vpaddd  %xmm4,%xmm15,%xmm15
+       vpaddd  %xmm5,%xmm12,%xmm12
+       vpaddd  %xmm6,%xmm13,%xmm13
+       vpxor   %xmm1,%xmm14,%xmm1
+       vpxor   %xmm2,%xmm15,%xmm2
+       vpxor   %xmm12,%xmm3,%xmm3
+       vpxor   %xmm13,%xmm0,%xmm0
+.byte  143,232,120,194,201,7
+.byte  143,232,120,194,210,7
+.byte  143,232,120,194,219,7
+.byte  143,232,120,194,192,7
+       decl    %eax
+       jnz     .Loop4xop
+
+       vpaddd  64(%rsp),%xmm8,%xmm8
+       vpaddd  80(%rsp),%xmm9,%xmm9
+       vpaddd  96(%rsp),%xmm10,%xmm10
+       vpaddd  112(%rsp),%xmm11,%xmm11
+
+       vmovdqa %xmm14,32(%rsp)
+       vmovdqa %xmm15,48(%rsp)
+
+       vpunpckldq      %xmm9,%xmm8,%xmm14
+       vpunpckldq      %xmm11,%xmm10,%xmm15
+       vpunpckhdq      %xmm9,%xmm8,%xmm8
+       vpunpckhdq      %xmm11,%xmm10,%xmm10
+       vpunpcklqdq     %xmm15,%xmm14,%xmm9
+       vpunpckhqdq     %xmm15,%xmm14,%xmm14
+       vpunpcklqdq     %xmm10,%xmm8,%xmm11
+       vpunpckhqdq     %xmm10,%xmm8,%xmm8
+       vpaddd  128-256(%rcx),%xmm0,%xmm0
+       vpaddd  144-256(%rcx),%xmm1,%xmm1
+       vpaddd  160-256(%rcx),%xmm2,%xmm2
+       vpaddd  176-256(%rcx),%xmm3,%xmm3
+
+       vmovdqa %xmm9,0(%rsp)
+       vmovdqa %xmm14,16(%rsp)
+       vmovdqa 32(%rsp),%xmm9
+       vmovdqa 48(%rsp),%xmm14
+
+       vpunpckldq      %xmm1,%xmm0,%xmm10
+       vpunpckldq      %xmm3,%xmm2,%xmm15
+       vpunpckhdq      %xmm1,%xmm0,%xmm0
+       vpunpckhdq      %xmm3,%xmm2,%xmm2
+       vpunpcklqdq     %xmm15,%xmm10,%xmm1
+       vpunpckhqdq     %xmm15,%xmm10,%xmm10
+       vpunpcklqdq     %xmm2,%xmm0,%xmm3
+       vpunpckhqdq     %xmm2,%xmm0,%xmm0
+       vpaddd  192-256(%rcx),%xmm12,%xmm12
+       vpaddd  208-256(%rcx),%xmm13,%xmm13
+       vpaddd  224-256(%rcx),%xmm9,%xmm9
+       vpaddd  240-256(%rcx),%xmm14,%xmm14
+
+       vpunpckldq      %xmm13,%xmm12,%xmm2
+       vpunpckldq      %xmm14,%xmm9,%xmm15
+       vpunpckhdq      %xmm13,%xmm12,%xmm12
+       vpunpckhdq      %xmm14,%xmm9,%xmm9
+       vpunpcklqdq     %xmm15,%xmm2,%xmm13
+       vpunpckhqdq     %xmm15,%xmm2,%xmm2
+       vpunpcklqdq     %xmm9,%xmm12,%xmm14
+       vpunpckhqdq     %xmm9,%xmm12,%xmm12
+       vpaddd  256-256(%rcx),%xmm4,%xmm4
+       vpaddd  272-256(%rcx),%xmm5,%xmm5
+       vpaddd  288-256(%rcx),%xmm6,%xmm6
+       vpaddd  304-256(%rcx),%xmm7,%xmm7
+
+       vpunpckldq      %xmm5,%xmm4,%xmm9
+       vpunpckldq      %xmm7,%xmm6,%xmm15
+       vpunpckhdq      %xmm5,%xmm4,%xmm4
+       vpunpckhdq      %xmm7,%xmm6,%xmm6
+       vpunpcklqdq     %xmm15,%xmm9,%xmm5
+       vpunpckhqdq     %xmm15,%xmm9,%xmm9
+       vpunpcklqdq     %xmm6,%xmm4,%xmm7
+       vpunpckhqdq     %xmm6,%xmm4,%xmm4
+       vmovdqa 0(%rsp),%xmm6
+       vmovdqa 16(%rsp),%xmm15
+
+       cmpq    $256,%rdx
+       jb      .Ltail4xop
+
+       vpxor   0(%rsi),%xmm6,%xmm6
+       vpxor   16(%rsi),%xmm1,%xmm1
+       vpxor   32(%rsi),%xmm13,%xmm13
+       vpxor   48(%rsi),%xmm5,%xmm5
+       vpxor   64(%rsi),%xmm15,%xmm15
+       vpxor   80(%rsi),%xmm10,%xmm10
+       vpxor   96(%rsi),%xmm2,%xmm2
+       vpxor   112(%rsi),%xmm9,%xmm9
+       leaq    128(%rsi),%rsi
+       vpxor   0(%rsi),%xmm11,%xmm11
+       vpxor   16(%rsi),%xmm3,%xmm3
+       vpxor   32(%rsi),%xmm14,%xmm14
+       vpxor   48(%rsi),%xmm7,%xmm7
+       vpxor   64(%rsi),%xmm8,%xmm8
+       vpxor   80(%rsi),%xmm0,%xmm0
+       vpxor   96(%rsi),%xmm12,%xmm12
+       vpxor   112(%rsi),%xmm4,%xmm4
+       leaq    128(%rsi),%rsi
+
+       vmovdqu %xmm6,0(%rdi)
+       vmovdqu %xmm1,16(%rdi)
+       vmovdqu %xmm13,32(%rdi)
+       vmovdqu %xmm5,48(%rdi)
+       vmovdqu %xmm15,64(%rdi)
+       vmovdqu %xmm10,80(%rdi)
+       vmovdqu %xmm2,96(%rdi)
+       vmovdqu %xmm9,112(%rdi)
+       leaq    128(%rdi),%rdi
+       vmovdqu %xmm11,0(%rdi)
+       vmovdqu %xmm3,16(%rdi)
+       vmovdqu %xmm14,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       vmovdqu %xmm8,64(%rdi)
+       vmovdqu %xmm0,80(%rdi)
+       vmovdqu %xmm12,96(%rdi)
+       vmovdqu %xmm4,112(%rdi)
+       leaq    128(%rdi),%rdi
+
+       subq    $256,%rdx
+       jnz     .Loop_outer4xop
+
+       jmp     .Ldone4xop
+
+.align 32
+.Ltail4xop:
+       cmpq    $192,%rdx
+       jae     .L192_or_more4xop
+       cmpq    $128,%rdx
+       jae     .L128_or_more4xop
+       cmpq    $64,%rdx
+       jae     .L64_or_more4xop
+
+       xorq    %r10,%r10
+       vmovdqa %xmm6,0(%rsp)
+       vmovdqa %xmm1,16(%rsp)
+       vmovdqa %xmm13,32(%rsp)
+       vmovdqa %xmm5,48(%rsp)
+       jmp     .Loop_tail4xop
+
+.align 32
+.L64_or_more4xop:
+       vpxor   0(%rsi),%xmm6,%xmm6
+       vpxor   16(%rsi),%xmm1,%xmm1
+       vpxor   32(%rsi),%xmm13,%xmm13
+       vpxor   48(%rsi),%xmm5,%xmm5
+       vmovdqu %xmm6,0(%rdi)
+       vmovdqu %xmm1,16(%rdi)
+       vmovdqu %xmm13,32(%rdi)
+       vmovdqu %xmm5,48(%rdi)
+       je      .Ldone4xop
+
+       leaq    64(%rsi),%rsi
+       vmovdqa %xmm15,0(%rsp)
+       xorq    %r10,%r10
+       vmovdqa %xmm10,16(%rsp)
+       leaq    64(%rdi),%rdi
+       vmovdqa %xmm2,32(%rsp)
+       subq    $64,%rdx
+       vmovdqa %xmm9,48(%rsp)
+       jmp     .Loop_tail4xop
+
+.align 32
+.L128_or_more4xop:
+       vpxor   0(%rsi),%xmm6,%xmm6
+       vpxor   16(%rsi),%xmm1,%xmm1
+       vpxor   32(%rsi),%xmm13,%xmm13
+       vpxor   48(%rsi),%xmm5,%xmm5
+       vpxor   64(%rsi),%xmm15,%xmm15
+       vpxor   80(%rsi),%xmm10,%xmm10
+       vpxor   96(%rsi),%xmm2,%xmm2
+       vpxor   112(%rsi),%xmm9,%xmm9
+
+       vmovdqu %xmm6,0(%rdi)
+       vmovdqu %xmm1,16(%rdi)
+       vmovdqu %xmm13,32(%rdi)
+       vmovdqu %xmm5,48(%rdi)
+       vmovdqu %xmm15,64(%rdi)
+       vmovdqu %xmm10,80(%rdi)
+       vmovdqu %xmm2,96(%rdi)
+       vmovdqu %xmm9,112(%rdi)
+       je      .Ldone4xop
+
+       leaq    128(%rsi),%rsi
+       vmovdqa %xmm11,0(%rsp)
+       xorq    %r10,%r10
+       vmovdqa %xmm3,16(%rsp)
+       leaq    128(%rdi),%rdi
+       vmovdqa %xmm14,32(%rsp)
+       subq    $128,%rdx
+       vmovdqa %xmm7,48(%rsp)
+       jmp     .Loop_tail4xop
+
+.align 32
+.L192_or_more4xop:
+       vpxor   0(%rsi),%xmm6,%xmm6
+       vpxor   16(%rsi),%xmm1,%xmm1
+       vpxor   32(%rsi),%xmm13,%xmm13
+       vpxor   48(%rsi),%xmm5,%xmm5
+       vpxor   64(%rsi),%xmm15,%xmm15
+       vpxor   80(%rsi),%xmm10,%xmm10
+       vpxor   96(%rsi),%xmm2,%xmm2
+       vpxor   112(%rsi),%xmm9,%xmm9
+       leaq    128(%rsi),%rsi
+       vpxor   0(%rsi),%xmm11,%xmm11
+       vpxor   16(%rsi),%xmm3,%xmm3
+       vpxor   32(%rsi),%xmm14,%xmm14
+       vpxor   48(%rsi),%xmm7,%xmm7
+
+       vmovdqu %xmm6,0(%rdi)
+       vmovdqu %xmm1,16(%rdi)
+       vmovdqu %xmm13,32(%rdi)
+       vmovdqu %xmm5,48(%rdi)
+       vmovdqu %xmm15,64(%rdi)
+       vmovdqu %xmm10,80(%rdi)
+       vmovdqu %xmm2,96(%rdi)
+       vmovdqu %xmm9,112(%rdi)
+       leaq    128(%rdi),%rdi
+       vmovdqu %xmm11,0(%rdi)
+       vmovdqu %xmm3,16(%rdi)
+       vmovdqu %xmm14,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       je      .Ldone4xop
+
+       leaq    64(%rsi),%rsi
+       vmovdqa %xmm8,0(%rsp)
+       xorq    %r10,%r10
+       vmovdqa %xmm0,16(%rsp)
+       leaq    64(%rdi),%rdi
+       vmovdqa %xmm12,32(%rsp)
+       subq    $192,%rdx
+       vmovdqa %xmm4,48(%rsp)
+
+.Loop_tail4xop:
+       movzbl  (%rsi,%r10,1),%eax
+       movzbl  (%rsp,%r10,1),%ecx
+       leaq    1(%r10),%r10
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r10,1)
+       decq    %rdx
+       jnz     .Loop_tail4xop
+
+.Ldone4xop:
+       vzeroupper
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L4xop_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_4xop,.-ChaCha20_4xop
+.type  ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:
+.cfi_startproc 
+.LChaCha20_8x:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       subq    $0x280+8,%rsp
+       andq    $-32,%rsp
+       vzeroupper
+
+
+
+
+
+
+
+
+
+
+       vbroadcasti128  .Lsigma(%rip),%ymm11
+       vbroadcasti128  (%rcx),%ymm3
+       vbroadcasti128  16(%rcx),%ymm15
+       vbroadcasti128  (%r8),%ymm7
+       leaq    256(%rsp),%rcx
+       leaq    512(%rsp),%rax
+       leaq    .Lrot16(%rip),%r10
+       leaq    .Lrot24(%rip),%r11
+
+       vpshufd $0x00,%ymm11,%ymm8
+       vpshufd $0x55,%ymm11,%ymm9
+       vmovdqa %ymm8,128-256(%rcx)
+       vpshufd $0xaa,%ymm11,%ymm10
+       vmovdqa %ymm9,160-256(%rcx)
+       vpshufd $0xff,%ymm11,%ymm11
+       vmovdqa %ymm10,192-256(%rcx)
+       vmovdqa %ymm11,224-256(%rcx)
+
+       vpshufd $0x00,%ymm3,%ymm0
+       vpshufd $0x55,%ymm3,%ymm1
+       vmovdqa %ymm0,256-256(%rcx)
+       vpshufd $0xaa,%ymm3,%ymm2
+       vmovdqa %ymm1,288-256(%rcx)
+       vpshufd $0xff,%ymm3,%ymm3
+       vmovdqa %ymm2,320-256(%rcx)
+       vmovdqa %ymm3,352-256(%rcx)
+
+       vpshufd $0x00,%ymm15,%ymm12
+       vpshufd $0x55,%ymm15,%ymm13
+       vmovdqa %ymm12,384-512(%rax)
+       vpshufd $0xaa,%ymm15,%ymm14
+       vmovdqa %ymm13,416-512(%rax)
+       vpshufd $0xff,%ymm15,%ymm15
+       vmovdqa %ymm14,448-512(%rax)
+       vmovdqa %ymm15,480-512(%rax)
+
+       vpshufd $0x00,%ymm7,%ymm4
+       vpshufd $0x55,%ymm7,%ymm5
+       vpaddd  .Lincy(%rip),%ymm4,%ymm4
+       vpshufd $0xaa,%ymm7,%ymm6
+       vmovdqa %ymm5,544-512(%rax)
+       vpshufd $0xff,%ymm7,%ymm7
+       vmovdqa %ymm6,576-512(%rax)
+       vmovdqa %ymm7,608-512(%rax)
+
+       jmp     .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+       vmovdqa 128-256(%rcx),%ymm8
+       vmovdqa 160-256(%rcx),%ymm9
+       vmovdqa 192-256(%rcx),%ymm10
+       vmovdqa 224-256(%rcx),%ymm11
+       vmovdqa 256-256(%rcx),%ymm0
+       vmovdqa 288-256(%rcx),%ymm1
+       vmovdqa 320-256(%rcx),%ymm2
+       vmovdqa 352-256(%rcx),%ymm3
+       vmovdqa 384-512(%rax),%ymm12
+       vmovdqa 416-512(%rax),%ymm13
+       vmovdqa 448-512(%rax),%ymm14
+       vmovdqa 480-512(%rax),%ymm15
+       vmovdqa 512-512(%rax),%ymm4
+       vmovdqa 544-512(%rax),%ymm5
+       vmovdqa 576-512(%rax),%ymm6
+       vmovdqa 608-512(%rax),%ymm7
+       vpaddd  .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+       vmovdqa %ymm14,64(%rsp)
+       vmovdqa %ymm15,96(%rsp)
+       vbroadcasti128  (%r10),%ymm15
+       vmovdqa %ymm4,512-512(%rax)
+       movl    $10,%eax
+       jmp     .Loop8x
+
+.align 32
+.Loop8x:
+       vpaddd  %ymm0,%ymm8,%ymm8
+       vpxor   %ymm4,%ymm8,%ymm4
+       vpshufb %ymm15,%ymm4,%ymm4
+       vpaddd  %ymm1,%ymm9,%ymm9
+       vpxor   %ymm5,%ymm9,%ymm5
+       vpshufb %ymm15,%ymm5,%ymm5
+       vpaddd  %ymm4,%ymm12,%ymm12
+       vpxor   %ymm0,%ymm12,%ymm0
+       vpslld  $12,%ymm0,%ymm14
+       vpsrld  $20,%ymm0,%ymm0
+       vpor    %ymm0,%ymm14,%ymm0
+       vbroadcasti128  (%r11),%ymm14
+       vpaddd  %ymm5,%ymm13,%ymm13
+       vpxor   %ymm1,%ymm13,%ymm1
+       vpslld  $12,%ymm1,%ymm15
+       vpsrld  $20,%ymm1,%ymm1
+       vpor    %ymm1,%ymm15,%ymm1
+       vpaddd  %ymm0,%ymm8,%ymm8
+       vpxor   %ymm4,%ymm8,%ymm4
+       vpshufb %ymm14,%ymm4,%ymm4
+       vpaddd  %ymm1,%ymm9,%ymm9
+       vpxor   %ymm5,%ymm9,%ymm5
+       vpshufb %ymm14,%ymm5,%ymm5
+       vpaddd  %ymm4,%ymm12,%ymm12
+       vpxor   %ymm0,%ymm12,%ymm0
+       vpslld  $7,%ymm0,%ymm15
+       vpsrld  $25,%ymm0,%ymm0
+       vpor    %ymm0,%ymm15,%ymm0
+       vbroadcasti128  (%r10),%ymm15
+       vpaddd  %ymm5,%ymm13,%ymm13
+       vpxor   %ymm1,%ymm13,%ymm1
+       vpslld  $7,%ymm1,%ymm14
+       vpsrld  $25,%ymm1,%ymm1
+       vpor    %ymm1,%ymm14,%ymm1
+       vmovdqa %ymm12,0(%rsp)
+       vmovdqa %ymm13,32(%rsp)
+       vmovdqa 64(%rsp),%ymm12
+       vmovdqa 96(%rsp),%ymm13
+       vpaddd  %ymm2,%ymm10,%ymm10
+       vpxor   %ymm6,%ymm10,%ymm6
+       vpshufb %ymm15,%ymm6,%ymm6
+       vpaddd  %ymm3,%ymm11,%ymm11
+       vpxor   %ymm7,%ymm11,%ymm7
+       vpshufb %ymm15,%ymm7,%ymm7
+       vpaddd  %ymm6,%ymm12,%ymm12
+       vpxor   %ymm2,%ymm12,%ymm2
+       vpslld  $12,%ymm2,%ymm14
+       vpsrld  $20,%ymm2,%ymm2
+       vpor    %ymm2,%ymm14,%ymm2
+       vbroadcasti128  (%r11),%ymm14
+       vpaddd  %ymm7,%ymm13,%ymm13
+       vpxor   %ymm3,%ymm13,%ymm3
+       vpslld  $12,%ymm3,%ymm15
+       vpsrld  $20,%ymm3,%ymm3
+       vpor    %ymm3,%ymm15,%ymm3
+       vpaddd  %ymm2,%ymm10,%ymm10
+       vpxor   %ymm6,%ymm10,%ymm6
+       vpshufb %ymm14,%ymm6,%ymm6
+       vpaddd  %ymm3,%ymm11,%ymm11
+       vpxor   %ymm7,%ymm11,%ymm7
+       vpshufb %ymm14,%ymm7,%ymm7
+       vpaddd  %ymm6,%ymm12,%ymm12
+       vpxor   %ymm2,%ymm12,%ymm2
+       vpslld  $7,%ymm2,%ymm15
+       vpsrld  $25,%ymm2,%ymm2
+       vpor    %ymm2,%ymm15,%ymm2
+       vbroadcasti128  (%r10),%ymm15
+       vpaddd  %ymm7,%ymm13,%ymm13
+       vpxor   %ymm3,%ymm13,%ymm3
+       vpslld  $7,%ymm3,%ymm14
+       vpsrld  $25,%ymm3,%ymm3
+       vpor    %ymm3,%ymm14,%ymm3
+       vpaddd  %ymm1,%ymm8,%ymm8
+       vpxor   %ymm7,%ymm8,%ymm7
+       vpshufb %ymm15,%ymm7,%ymm7
+       vpaddd  %ymm2,%ymm9,%ymm9
+       vpxor   %ymm4,%ymm9,%ymm4
+       vpshufb %ymm15,%ymm4,%ymm4
+       vpaddd  %ymm7,%ymm12,%ymm12
+       vpxor   %ymm1,%ymm12,%ymm1
+       vpslld  $12,%ymm1,%ymm14
+       vpsrld  $20,%ymm1,%ymm1
+       vpor    %ymm1,%ymm14,%ymm1
+       vbroadcasti128  (%r11),%ymm14
+       vpaddd  %ymm4,%ymm13,%ymm13
+       vpxor   %ymm2,%ymm13,%ymm2
+       vpslld  $12,%ymm2,%ymm15
+       vpsrld  $20,%ymm2,%ymm2
+       vpor    %ymm2,%ymm15,%ymm2
+       vpaddd  %ymm1,%ymm8,%ymm8
+       vpxor   %ymm7,%ymm8,%ymm7
+       vpshufb %ymm14,%ymm7,%ymm7
+       vpaddd  %ymm2,%ymm9,%ymm9
+       vpxor   %ymm4,%ymm9,%ymm4
+       vpshufb %ymm14,%ymm4,%ymm4
+       vpaddd  %ymm7,%ymm12,%ymm12
+       vpxor   %ymm1,%ymm12,%ymm1
+       vpslld  $7,%ymm1,%ymm15
+       vpsrld  $25,%ymm1,%ymm1
+       vpor    %ymm1,%ymm15,%ymm1
+       vbroadcasti128  (%r10),%ymm15
+       vpaddd  %ymm4,%ymm13,%ymm13
+       vpxor   %ymm2,%ymm13,%ymm2
+       vpslld  $7,%ymm2,%ymm14
+       vpsrld  $25,%ymm2,%ymm2
+       vpor    %ymm2,%ymm14,%ymm2
+       vmovdqa %ymm12,64(%rsp)
+       vmovdqa %ymm13,96(%rsp)
+       vmovdqa 0(%rsp),%ymm12
+       vmovdqa 32(%rsp),%ymm13
+       vpaddd  %ymm3,%ymm10,%ymm10
+       vpxor   %ymm5,%ymm10,%ymm5
+       vpshufb %ymm15,%ymm5,%ymm5
+       vpaddd  %ymm0,%ymm11,%ymm11
+       vpxor   %ymm6,%ymm11,%ymm6
+       vpshufb %ymm15,%ymm6,%ymm6
+       vpaddd  %ymm5,%ymm12,%ymm12
+       vpxor   %ymm3,%ymm12,%ymm3
+       vpslld  $12,%ymm3,%ymm14
+       vpsrld  $20,%ymm3,%ymm3
+       vpor    %ymm3,%ymm14,%ymm3
+       vbroadcasti128  (%r11),%ymm14
+       vpaddd  %ymm6,%ymm13,%ymm13
+       vpxor   %ymm0,%ymm13,%ymm0
+       vpslld  $12,%ymm0,%ymm15
+       vpsrld  $20,%ymm0,%ymm0
+       vpor    %ymm0,%ymm15,%ymm0
+       vpaddd  %ymm3,%ymm10,%ymm10
+       vpxor   %ymm5,%ymm10,%ymm5
+       vpshufb %ymm14,%ymm5,%ymm5
+       vpaddd  %ymm0,%ymm11,%ymm11
+       vpxor   %ymm6,%ymm11,%ymm6
+       vpshufb %ymm14,%ymm6,%ymm6
+       vpaddd  %ymm5,%ymm12,%ymm12
+       vpxor   %ymm3,%ymm12,%ymm3
+       vpslld  $7,%ymm3,%ymm15
+       vpsrld  $25,%ymm3,%ymm3
+       vpor    %ymm3,%ymm15,%ymm3
+       vbroadcasti128  (%r10),%ymm15
+       vpaddd  %ymm6,%ymm13,%ymm13
+       vpxor   %ymm0,%ymm13,%ymm0
+       vpslld  $7,%ymm0,%ymm14
+       vpsrld  $25,%ymm0,%ymm0
+       vpor    %ymm0,%ymm14,%ymm0
+       decl    %eax
+       jnz     .Loop8x
+
+       leaq    512(%rsp),%rax
+       vpaddd  128-256(%rcx),%ymm8,%ymm8
+       vpaddd  160-256(%rcx),%ymm9,%ymm9
+       vpaddd  192-256(%rcx),%ymm10,%ymm10
+       vpaddd  224-256(%rcx),%ymm11,%ymm11
+
+       vpunpckldq      %ymm9,%ymm8,%ymm14
+       vpunpckldq      %ymm11,%ymm10,%ymm15
+       vpunpckhdq      %ymm9,%ymm8,%ymm8
+       vpunpckhdq      %ymm11,%ymm10,%ymm10
+       vpunpcklqdq     %ymm15,%ymm14,%ymm9
+       vpunpckhqdq     %ymm15,%ymm14,%ymm14
+       vpunpcklqdq     %ymm10,%ymm8,%ymm11
+       vpunpckhqdq     %ymm10,%ymm8,%ymm8
+       vpaddd  256-256(%rcx),%ymm0,%ymm0
+       vpaddd  288-256(%rcx),%ymm1,%ymm1
+       vpaddd  320-256(%rcx),%ymm2,%ymm2
+       vpaddd  352-256(%rcx),%ymm3,%ymm3
+
+       vpunpckldq      %ymm1,%ymm0,%ymm10
+       vpunpckldq      %ymm3,%ymm2,%ymm15
+       vpunpckhdq      %ymm1,%ymm0,%ymm0
+       vpunpckhdq      %ymm3,%ymm2,%ymm2
+       vpunpcklqdq     %ymm15,%ymm10,%ymm1
+       vpunpckhqdq     %ymm15,%ymm10,%ymm10
+       vpunpcklqdq     %ymm2,%ymm0,%ymm3
+       vpunpckhqdq     %ymm2,%ymm0,%ymm0
+       vperm2i128      $0x20,%ymm1,%ymm9,%ymm15
+       vperm2i128      $0x31,%ymm1,%ymm9,%ymm1
+       vperm2i128      $0x20,%ymm10,%ymm14,%ymm9
+       vperm2i128      $0x31,%ymm10,%ymm14,%ymm10
+       vperm2i128      $0x20,%ymm3,%ymm11,%ymm14
+       vperm2i128      $0x31,%ymm3,%ymm11,%ymm3
+       vperm2i128      $0x20,%ymm0,%ymm8,%ymm11
+       vperm2i128      $0x31,%ymm0,%ymm8,%ymm0
+       vmovdqa %ymm15,0(%rsp)
+       vmovdqa %ymm9,32(%rsp)
+       vmovdqa 64(%rsp),%ymm15
+       vmovdqa 96(%rsp),%ymm9
+
+       vpaddd  384-512(%rax),%ymm12,%ymm12
+       vpaddd  416-512(%rax),%ymm13,%ymm13
+       vpaddd  448-512(%rax),%ymm15,%ymm15
+       vpaddd  480-512(%rax),%ymm9,%ymm9
+
+       vpunpckldq      %ymm13,%ymm12,%ymm2
+       vpunpckldq      %ymm9,%ymm15,%ymm8
+       vpunpckhdq      %ymm13,%ymm12,%ymm12
+       vpunpckhdq      %ymm9,%ymm15,%ymm15
+       vpunpcklqdq     %ymm8,%ymm2,%ymm13
+       vpunpckhqdq     %ymm8,%ymm2,%ymm2
+       vpunpcklqdq     %ymm15,%ymm12,%ymm9
+       vpunpckhqdq     %ymm15,%ymm12,%ymm12
+       vpaddd  512-512(%rax),%ymm4,%ymm4
+       vpaddd  544-512(%rax),%ymm5,%ymm5
+       vpaddd  576-512(%rax),%ymm6,%ymm6
+       vpaddd  608-512(%rax),%ymm7,%ymm7
+
+       vpunpckldq      %ymm5,%ymm4,%ymm15
+       vpunpckldq      %ymm7,%ymm6,%ymm8
+       vpunpckhdq      %ymm5,%ymm4,%ymm4
+       vpunpckhdq      %ymm7,%ymm6,%ymm6
+       vpunpcklqdq     %ymm8,%ymm15,%ymm5
+       vpunpckhqdq     %ymm8,%ymm15,%ymm15
+       vpunpcklqdq     %ymm6,%ymm4,%ymm7
+       vpunpckhqdq     %ymm6,%ymm4,%ymm4
+       vperm2i128      $0x20,%ymm5,%ymm13,%ymm8
+       vperm2i128      $0x31,%ymm5,%ymm13,%ymm5
+       vperm2i128      $0x20,%ymm15,%ymm2,%ymm13
+       vperm2i128      $0x31,%ymm15,%ymm2,%ymm15
+       vperm2i128      $0x20,%ymm7,%ymm9,%ymm2
+       vperm2i128      $0x31,%ymm7,%ymm9,%ymm7
+       vperm2i128      $0x20,%ymm4,%ymm12,%ymm9
+       vperm2i128      $0x31,%ymm4,%ymm12,%ymm4
+       vmovdqa 0(%rsp),%ymm6
+       vmovdqa 32(%rsp),%ymm12
+
+       cmpq    $512,%rdx
+       jb      .Ltail8x
+
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       leaq    128(%rsi),%rsi
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       leaq    128(%rdi),%rdi
+
+       vpxor   0(%rsi),%ymm12,%ymm12
+       vpxor   32(%rsi),%ymm13,%ymm13
+       vpxor   64(%rsi),%ymm10,%ymm10
+       vpxor   96(%rsi),%ymm15,%ymm15
+       leaq    128(%rsi),%rsi
+       vmovdqu %ymm12,0(%rdi)
+       vmovdqu %ymm13,32(%rdi)
+       vmovdqu %ymm10,64(%rdi)
+       vmovdqu %ymm15,96(%rdi)
+       leaq    128(%rdi),%rdi
+
+       vpxor   0(%rsi),%ymm14,%ymm14
+       vpxor   32(%rsi),%ymm2,%ymm2
+       vpxor   64(%rsi),%ymm3,%ymm3
+       vpxor   96(%rsi),%ymm7,%ymm7
+       leaq    128(%rsi),%rsi
+       vmovdqu %ymm14,0(%rdi)
+       vmovdqu %ymm2,32(%rdi)
+       vmovdqu %ymm3,64(%rdi)
+       vmovdqu %ymm7,96(%rdi)
+       leaq    128(%rdi),%rdi
+
+       vpxor   0(%rsi),%ymm11,%ymm11
+       vpxor   32(%rsi),%ymm9,%ymm9
+       vpxor   64(%rsi),%ymm0,%ymm0
+       vpxor   96(%rsi),%ymm4,%ymm4
+       leaq    128(%rsi),%rsi
+       vmovdqu %ymm11,0(%rdi)
+       vmovdqu %ymm9,32(%rdi)
+       vmovdqu %ymm0,64(%rdi)
+       vmovdqu %ymm4,96(%rdi)
+       leaq    128(%rdi),%rdi
+
+       subq    $512,%rdx
+       jnz     .Loop_outer8x
+
+       jmp     .Ldone8x
+
+.Ltail8x:
+       cmpq    $448,%rdx
+       jae     .L448_or_more8x
+       cmpq    $384,%rdx
+       jae     .L384_or_more8x
+       cmpq    $320,%rdx
+       jae     .L320_or_more8x
+       cmpq    $256,%rdx
+       jae     .L256_or_more8x
+       cmpq    $192,%rdx
+       jae     .L192_or_more8x
+       cmpq    $128,%rdx
+       jae     .L128_or_more8x
+       cmpq    $64,%rdx
+       jae     .L64_or_more8x
+
+       xorq    %r10,%r10
+       vmovdqa %ymm6,0(%rsp)
+       vmovdqa %ymm8,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       je      .Ldone8x
+
+       leaq    64(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm1,0(%rsp)
+       leaq    64(%rdi),%rdi
+       subq    $64,%rdx
+       vmovdqa %ymm5,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       je      .Ldone8x
+
+       leaq    128(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm12,0(%rsp)
+       leaq    128(%rdi),%rdi
+       subq    $128,%rdx
+       vmovdqa %ymm13,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vpxor   128(%rsi),%ymm12,%ymm12
+       vpxor   160(%rsi),%ymm13,%ymm13
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       vmovdqu %ymm12,128(%rdi)
+       vmovdqu %ymm13,160(%rdi)
+       je      .Ldone8x
+
+       leaq    192(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm10,0(%rsp)
+       leaq    192(%rdi),%rdi
+       subq    $192,%rdx
+       vmovdqa %ymm15,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vpxor   128(%rsi),%ymm12,%ymm12
+       vpxor   160(%rsi),%ymm13,%ymm13
+       vpxor   192(%rsi),%ymm10,%ymm10
+       vpxor   224(%rsi),%ymm15,%ymm15
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       vmovdqu %ymm12,128(%rdi)
+       vmovdqu %ymm13,160(%rdi)
+       vmovdqu %ymm10,192(%rdi)
+       vmovdqu %ymm15,224(%rdi)
+       je      .Ldone8x
+
+       leaq    256(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm14,0(%rsp)
+       leaq    256(%rdi),%rdi
+       subq    $256,%rdx
+       vmovdqa %ymm2,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vpxor   128(%rsi),%ymm12,%ymm12
+       vpxor   160(%rsi),%ymm13,%ymm13
+       vpxor   192(%rsi),%ymm10,%ymm10
+       vpxor   224(%rsi),%ymm15,%ymm15
+       vpxor   256(%rsi),%ymm14,%ymm14
+       vpxor   288(%rsi),%ymm2,%ymm2
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       vmovdqu %ymm12,128(%rdi)
+       vmovdqu %ymm13,160(%rdi)
+       vmovdqu %ymm10,192(%rdi)
+       vmovdqu %ymm15,224(%rdi)
+       vmovdqu %ymm14,256(%rdi)
+       vmovdqu %ymm2,288(%rdi)
+       je      .Ldone8x
+
+       leaq    320(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm3,0(%rsp)
+       leaq    320(%rdi),%rdi
+       subq    $320,%rdx
+       vmovdqa %ymm7,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vpxor   128(%rsi),%ymm12,%ymm12
+       vpxor   160(%rsi),%ymm13,%ymm13
+       vpxor   192(%rsi),%ymm10,%ymm10
+       vpxor   224(%rsi),%ymm15,%ymm15
+       vpxor   256(%rsi),%ymm14,%ymm14
+       vpxor   288(%rsi),%ymm2,%ymm2
+       vpxor   320(%rsi),%ymm3,%ymm3
+       vpxor   352(%rsi),%ymm7,%ymm7
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       vmovdqu %ymm12,128(%rdi)
+       vmovdqu %ymm13,160(%rdi)
+       vmovdqu %ymm10,192(%rdi)
+       vmovdqu %ymm15,224(%rdi)
+       vmovdqu %ymm14,256(%rdi)
+       vmovdqu %ymm2,288(%rdi)
+       vmovdqu %ymm3,320(%rdi)
+       vmovdqu %ymm7,352(%rdi)
+       je      .Ldone8x
+
+       leaq    384(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm11,0(%rsp)
+       leaq    384(%rdi),%rdi
+       subq    $384,%rdx
+       vmovdqa %ymm9,32(%rsp)
+       jmp     .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+       vpxor   0(%rsi),%ymm6,%ymm6
+       vpxor   32(%rsi),%ymm8,%ymm8
+       vpxor   64(%rsi),%ymm1,%ymm1
+       vpxor   96(%rsi),%ymm5,%ymm5
+       vpxor   128(%rsi),%ymm12,%ymm12
+       vpxor   160(%rsi),%ymm13,%ymm13
+       vpxor   192(%rsi),%ymm10,%ymm10
+       vpxor   224(%rsi),%ymm15,%ymm15
+       vpxor   256(%rsi),%ymm14,%ymm14
+       vpxor   288(%rsi),%ymm2,%ymm2
+       vpxor   320(%rsi),%ymm3,%ymm3
+       vpxor   352(%rsi),%ymm7,%ymm7
+       vpxor   384(%rsi),%ymm11,%ymm11
+       vpxor   416(%rsi),%ymm9,%ymm9
+       vmovdqu %ymm6,0(%rdi)
+       vmovdqu %ymm8,32(%rdi)
+       vmovdqu %ymm1,64(%rdi)
+       vmovdqu %ymm5,96(%rdi)
+       vmovdqu %ymm12,128(%rdi)
+       vmovdqu %ymm13,160(%rdi)
+       vmovdqu %ymm10,192(%rdi)
+       vmovdqu %ymm15,224(%rdi)
+       vmovdqu %ymm14,256(%rdi)
+       vmovdqu %ymm2,288(%rdi)
+       vmovdqu %ymm3,320(%rdi)
+       vmovdqu %ymm7,352(%rdi)
+       vmovdqu %ymm11,384(%rdi)
+       vmovdqu %ymm9,416(%rdi)
+       je      .Ldone8x
+
+       leaq    448(%rsi),%rsi
+       xorq    %r10,%r10
+       vmovdqa %ymm0,0(%rsp)
+       leaq    448(%rdi),%rdi
+       subq    $448,%rdx
+       vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+       movzbl  (%rsi,%r10,1),%eax
+       movzbl  (%rsp,%r10,1),%ecx
+       leaq    1(%r10),%r10
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r10,1)
+       decq    %rdx
+       jnz     .Loop_tail8x
+
+.Ldone8x:
+       vzeroall
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L8x_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_8x,.-ChaCha20_8x
+.type  ChaCha20_avx512,@function
+.align 32
+ChaCha20_avx512:
+.cfi_startproc 
+.LChaCha20_avx512:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       cmpq    $512,%rdx
+       ja      .LChaCha20_16x
+
+       subq    $64+8,%rsp
+       vbroadcasti32x4 .Lsigma(%rip),%zmm0
+       vbroadcasti32x4 (%rcx),%zmm1
+       vbroadcasti32x4 16(%rcx),%zmm2
+       vbroadcasti32x4 (%r8),%zmm3
+
+       vmovdqa32       %zmm0,%zmm16
+       vmovdqa32       %zmm1,%zmm17
+       vmovdqa32       %zmm2,%zmm18
+       vpaddd  .Lzeroz(%rip),%zmm3,%zmm3
+       vmovdqa32       .Lfourz(%rip),%zmm20
+       movq    $10,%r8
+       vmovdqa32       %zmm3,%zmm19
+       jmp     .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+       vmovdqa32       %zmm16,%zmm0
+       vmovdqa32       %zmm17,%zmm1
+       vmovdqa32       %zmm18,%zmm2
+       vpaddd  %zmm20,%zmm19,%zmm3
+       movq    $10,%r8
+       vmovdqa32       %zmm3,%zmm19
+       jmp     .Loop_avx512
+
+.align 32
+.Loop_avx512:
+       vpaddd  %zmm1,%zmm0,%zmm0
+       vpxord  %zmm0,%zmm3,%zmm3
+       vprold  $16,%zmm3,%zmm3
+       vpaddd  %zmm3,%zmm2,%zmm2
+       vpxord  %zmm2,%zmm1,%zmm1
+       vprold  $12,%zmm1,%zmm1
+       vpaddd  %zmm1,%zmm0,%zmm0
+       vpxord  %zmm0,%zmm3,%zmm3
+       vprold  $8,%zmm3,%zmm3
+       vpaddd  %zmm3,%zmm2,%zmm2
+       vpxord  %zmm2,%zmm1,%zmm1
+       vprold  $7,%zmm1,%zmm1
+       vpshufd $78,%zmm2,%zmm2
+       vpshufd $57,%zmm1,%zmm1
+       vpshufd $147,%zmm3,%zmm3
+       vpaddd  %zmm1,%zmm0,%zmm0
+       vpxord  %zmm0,%zmm3,%zmm3
+       vprold  $16,%zmm3,%zmm3
+       vpaddd  %zmm3,%zmm2,%zmm2
+       vpxord  %zmm2,%zmm1,%zmm1
+       vprold  $12,%zmm1,%zmm1
+       vpaddd  %zmm1,%zmm0,%zmm0
+       vpxord  %zmm0,%zmm3,%zmm3
+       vprold  $8,%zmm3,%zmm3
+       vpaddd  %zmm3,%zmm2,%zmm2
+       vpxord  %zmm2,%zmm1,%zmm1
+       vprold  $7,%zmm1,%zmm1
+       vpshufd $78,%zmm2,%zmm2
+       vpshufd $147,%zmm1,%zmm1
+       vpshufd $57,%zmm3,%zmm3
+       decq    %r8
+       jnz     .Loop_avx512
+       vpaddd  %zmm16,%zmm0,%zmm0
+       vpaddd  %zmm17,%zmm1,%zmm1
+       vpaddd  %zmm18,%zmm2,%zmm2
+       vpaddd  %zmm19,%zmm3,%zmm3
+
+       subq    $64,%rdx
+       jb      .Ltail64_avx512
+
+       vpxor   0(%rsi),%xmm0,%xmm4
+       vpxor   16(%rsi),%xmm1,%xmm5
+       vpxor   32(%rsi),%xmm2,%xmm6
+       vpxor   48(%rsi),%xmm3,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       jz      .Ldone_avx512
+
+       vextracti32x4   $1,%zmm0,%xmm4
+       vextracti32x4   $1,%zmm1,%xmm5
+       vextracti32x4   $1,%zmm2,%xmm6
+       vextracti32x4   $1,%zmm3,%xmm7
+
+       subq    $64,%rdx
+       jb      .Ltail_avx512
+
+       vpxor   0(%rsi),%xmm4,%xmm4
+       vpxor   16(%rsi),%xmm5,%xmm5
+       vpxor   32(%rsi),%xmm6,%xmm6
+       vpxor   48(%rsi),%xmm7,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       jz      .Ldone_avx512
+
+       vextracti32x4   $2,%zmm0,%xmm4
+       vextracti32x4   $2,%zmm1,%xmm5
+       vextracti32x4   $2,%zmm2,%xmm6
+       vextracti32x4   $2,%zmm3,%xmm7
+
+       subq    $64,%rdx
+       jb      .Ltail_avx512
+
+       vpxor   0(%rsi),%xmm4,%xmm4
+       vpxor   16(%rsi),%xmm5,%xmm5
+       vpxor   32(%rsi),%xmm6,%xmm6
+       vpxor   48(%rsi),%xmm7,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       jz      .Ldone_avx512
+
+       vextracti32x4   $3,%zmm0,%xmm4
+       vextracti32x4   $3,%zmm1,%xmm5
+       vextracti32x4   $3,%zmm2,%xmm6
+       vextracti32x4   $3,%zmm3,%xmm7
+
+       subq    $64,%rdx
+       jb      .Ltail_avx512
+
+       vpxor   0(%rsi),%xmm4,%xmm4
+       vpxor   16(%rsi),%xmm5,%xmm5
+       vpxor   32(%rsi),%xmm6,%xmm6
+       vpxor   48(%rsi),%xmm7,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       jnz     .Loop_outer_avx512
+
+       jmp     .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+       vmovdqa %xmm0,0(%rsp)
+       vmovdqa %xmm1,16(%rsp)
+       vmovdqa %xmm2,32(%rsp)
+       vmovdqa %xmm3,48(%rsp)
+       addq    $64,%rdx
+       jmp     .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+       vmovdqa %xmm4,0(%rsp)
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       vmovdqa %xmm7,48(%rsp)
+       addq    $64,%rdx
+
+.Loop_tail_avx512:
+       movzbl  (%rsi,%r8,1),%eax
+       movzbl  (%rsp,%r8,1),%ecx
+       leaq    1(%r8),%r8
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r8,1)
+       decq    %rdx
+       jnz     .Loop_tail_avx512
+
+       vmovdqu32       %zmm16,0(%rsp)
+
+.Ldone_avx512:
+       vzeroall
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.Lavx512_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_avx512,.-ChaCha20_avx512
+.type  ChaCha20_avx512vl,@function
+.align 32
+ChaCha20_avx512vl:
+.cfi_startproc 
+.LChaCha20_avx512vl:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       cmpq    $128,%rdx
+       ja      .LChaCha20_8xvl
+
+       subq    $64+8,%rsp
+       vbroadcasti128  .Lsigma(%rip),%ymm0
+       vbroadcasti128  (%rcx),%ymm1
+       vbroadcasti128  16(%rcx),%ymm2
+       vbroadcasti128  (%r8),%ymm3
+
+       vmovdqa32       %ymm0,%ymm16
+       vmovdqa32       %ymm1,%ymm17
+       vmovdqa32       %ymm2,%ymm18
+       vpaddd  .Lzeroz(%rip),%ymm3,%ymm3
+       vmovdqa32       .Ltwoy(%rip),%ymm20
+       movq    $10,%r8
+       vmovdqa32       %ymm3,%ymm19
+       jmp     .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+       vmovdqa32       %ymm18,%ymm2
+       vpaddd  %ymm20,%ymm19,%ymm3
+       movq    $10,%r8
+       vmovdqa32       %ymm3,%ymm19
+       jmp     .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+       vpaddd  %ymm1,%ymm0,%ymm0
+       vpxor   %ymm0,%ymm3,%ymm3
+       vprold  $16,%ymm3,%ymm3
+       vpaddd  %ymm3,%ymm2,%ymm2
+       vpxor   %ymm2,%ymm1,%ymm1
+       vprold  $12,%ymm1,%ymm1
+       vpaddd  %ymm1,%ymm0,%ymm0
+       vpxor   %ymm0,%ymm3,%ymm3
+       vprold  $8,%ymm3,%ymm3
+       vpaddd  %ymm3,%ymm2,%ymm2
+       vpxor   %ymm2,%ymm1,%ymm1
+       vprold  $7,%ymm1,%ymm1
+       vpshufd $78,%ymm2,%ymm2
+       vpshufd $57,%ymm1,%ymm1
+       vpshufd $147,%ymm3,%ymm3
+       vpaddd  %ymm1,%ymm0,%ymm0
+       vpxor   %ymm0,%ymm3,%ymm3
+       vprold  $16,%ymm3,%ymm3
+       vpaddd  %ymm3,%ymm2,%ymm2
+       vpxor   %ymm2,%ymm1,%ymm1
+       vprold  $12,%ymm1,%ymm1
+       vpaddd  %ymm1,%ymm0,%ymm0
+       vpxor   %ymm0,%ymm3,%ymm3
+       vprold  $8,%ymm3,%ymm3
+       vpaddd  %ymm3,%ymm2,%ymm2
+       vpxor   %ymm2,%ymm1,%ymm1
+       vprold  $7,%ymm1,%ymm1
+       vpshufd $78,%ymm2,%ymm2
+       vpshufd $147,%ymm1,%ymm1
+       vpshufd $57,%ymm3,%ymm3
+       decq    %r8
+       jnz     .Loop_avx512vl
+       vpaddd  %ymm16,%ymm0,%ymm0
+       vpaddd  %ymm17,%ymm1,%ymm1
+       vpaddd  %ymm18,%ymm2,%ymm2
+       vpaddd  %ymm19,%ymm3,%ymm3
+
+       subq    $64,%rdx
+       jb      .Ltail64_avx512vl
+
+       vpxor   0(%rsi),%xmm0,%xmm4
+       vpxor   16(%rsi),%xmm1,%xmm5
+       vpxor   32(%rsi),%xmm2,%xmm6
+       vpxor   48(%rsi),%xmm3,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       jz      .Ldone_avx512vl
+
+       vextracti128    $1,%ymm0,%xmm4
+       vextracti128    $1,%ymm1,%xmm5
+       vextracti128    $1,%ymm2,%xmm6
+       vextracti128    $1,%ymm3,%xmm7
+
+       subq    $64,%rdx
+       jb      .Ltail_avx512vl
+
+       vpxor   0(%rsi),%xmm4,%xmm4
+       vpxor   16(%rsi),%xmm5,%xmm5
+       vpxor   32(%rsi),%xmm6,%xmm6
+       vpxor   48(%rsi),%xmm7,%xmm7
+       leaq    64(%rsi),%rsi
+
+       vmovdqu %xmm4,0(%rdi)
+       vmovdqu %xmm5,16(%rdi)
+       vmovdqu %xmm6,32(%rdi)
+       vmovdqu %xmm7,48(%rdi)
+       leaq    64(%rdi),%rdi
+
+       vmovdqa32       %ymm16,%ymm0
+       vmovdqa32       %ymm17,%ymm1
+       jnz     .Loop_outer_avx512vl
+
+       jmp     .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+       vmovdqa %xmm0,0(%rsp)
+       vmovdqa %xmm1,16(%rsp)
+       vmovdqa %xmm2,32(%rsp)
+       vmovdqa %xmm3,48(%rsp)
+       addq    $64,%rdx
+       jmp     .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+       vmovdqa %xmm4,0(%rsp)
+       vmovdqa %xmm5,16(%rsp)
+       vmovdqa %xmm6,32(%rsp)
+       vmovdqa %xmm7,48(%rsp)
+       addq    $64,%rdx
+
+.Loop_tail_avx512vl:
+       movzbl  (%rsi,%r8,1),%eax
+       movzbl  (%rsp,%r8,1),%ecx
+       leaq    1(%r8),%r8
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r8,1)
+       decq    %rdx
+       jnz     .Loop_tail_avx512vl
+
+       vmovdqu32       %ymm16,0(%rsp)
+       vmovdqu32       %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+       vzeroall
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.Lavx512vl_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_avx512vl,.-ChaCha20_avx512vl
+.type  ChaCha20_16x,@function
+.align 32
+ChaCha20_16x:
+.cfi_startproc 
+.LChaCha20_16x:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       subq    $64+8,%rsp
+       andq    $-64,%rsp
+       vzeroupper
+
+       leaq    .Lsigma(%rip),%r10
+       vbroadcasti32x4 (%r10),%zmm3
+       vbroadcasti32x4 (%rcx),%zmm7
+       vbroadcasti32x4 16(%rcx),%zmm11
+       vbroadcasti32x4 (%r8),%zmm15
+
+       vpshufd $0x00,%zmm3,%zmm0
+       vpshufd $0x55,%zmm3,%zmm1
+       vpshufd $0xaa,%zmm3,%zmm2
+       vpshufd $0xff,%zmm3,%zmm3
+       vmovdqa64       %zmm0,%zmm16
+       vmovdqa64       %zmm1,%zmm17
+       vmovdqa64       %zmm2,%zmm18
+       vmovdqa64       %zmm3,%zmm19
+
+       vpshufd $0x00,%zmm7,%zmm4
+       vpshufd $0x55,%zmm7,%zmm5
+       vpshufd $0xaa,%zmm7,%zmm6
+       vpshufd $0xff,%zmm7,%zmm7
+       vmovdqa64       %zmm4,%zmm20
+       vmovdqa64       %zmm5,%zmm21
+       vmovdqa64       %zmm6,%zmm22
+       vmovdqa64       %zmm7,%zmm23
+
+       vpshufd $0x00,%zmm11,%zmm8
+       vpshufd $0x55,%zmm11,%zmm9
+       vpshufd $0xaa,%zmm11,%zmm10
+       vpshufd $0xff,%zmm11,%zmm11
+       vmovdqa64       %zmm8,%zmm24
+       vmovdqa64       %zmm9,%zmm25
+       vmovdqa64       %zmm10,%zmm26
+       vmovdqa64       %zmm11,%zmm27
+
+       vpshufd $0x00,%zmm15,%zmm12
+       vpshufd $0x55,%zmm15,%zmm13
+       vpshufd $0xaa,%zmm15,%zmm14
+       vpshufd $0xff,%zmm15,%zmm15
+       vpaddd  .Lincz(%rip),%zmm12,%zmm12
+       vmovdqa64       %zmm12,%zmm28
+       vmovdqa64       %zmm13,%zmm29
+       vmovdqa64       %zmm14,%zmm30
+       vmovdqa64       %zmm15,%zmm31
+
+       movl    $10,%eax
+       jmp     .Loop16x
+
+.align 32
+.Loop_outer16x:
+       vpbroadcastd    0(%r10),%zmm0
+       vpbroadcastd    4(%r10),%zmm1
+       vpbroadcastd    8(%r10),%zmm2
+       vpbroadcastd    12(%r10),%zmm3
+       vpaddd  .Lsixteen(%rip),%zmm28,%zmm28
+       vmovdqa64       %zmm20,%zmm4
+       vmovdqa64       %zmm21,%zmm5
+       vmovdqa64       %zmm22,%zmm6
+       vmovdqa64       %zmm23,%zmm7
+       vmovdqa64       %zmm24,%zmm8
+       vmovdqa64       %zmm25,%zmm9
+       vmovdqa64       %zmm26,%zmm10
+       vmovdqa64       %zmm27,%zmm11
+       vmovdqa64       %zmm28,%zmm12
+       vmovdqa64       %zmm29,%zmm13
+       vmovdqa64       %zmm30,%zmm14
+       vmovdqa64       %zmm31,%zmm15
+
+       vmovdqa64       %zmm0,%zmm16
+       vmovdqa64       %zmm1,%zmm17
+       vmovdqa64       %zmm2,%zmm18
+       vmovdqa64       %zmm3,%zmm19
+
+       movl    $10,%eax
+       jmp     .Loop16x
+
+.align 32
+.Loop16x:
+       vpaddd  %zmm4,%zmm0,%zmm0
+       vpaddd  %zmm5,%zmm1,%zmm1
+       vpaddd  %zmm6,%zmm2,%zmm2
+       vpaddd  %zmm7,%zmm3,%zmm3
+       vpxord  %zmm0,%zmm12,%zmm12
+       vpxord  %zmm1,%zmm13,%zmm13
+       vpxord  %zmm2,%zmm14,%zmm14
+       vpxord  %zmm3,%zmm15,%zmm15
+       vprold  $16,%zmm12,%zmm12
+       vprold  $16,%zmm13,%zmm13
+       vprold  $16,%zmm14,%zmm14
+       vprold  $16,%zmm15,%zmm15
+       vpaddd  %zmm12,%zmm8,%zmm8
+       vpaddd  %zmm13,%zmm9,%zmm9
+       vpaddd  %zmm14,%zmm10,%zmm10
+       vpaddd  %zmm15,%zmm11,%zmm11
+       vpxord  %zmm8,%zmm4,%zmm4
+       vpxord  %zmm9,%zmm5,%zmm5
+       vpxord  %zmm10,%zmm6,%zmm6
+       vpxord  %zmm11,%zmm7,%zmm7
+       vprold  $12,%zmm4,%zmm4
+       vprold  $12,%zmm5,%zmm5
+       vprold  $12,%zmm6,%zmm6
+       vprold  $12,%zmm7,%zmm7
+       vpaddd  %zmm4,%zmm0,%zmm0
+       vpaddd  %zmm5,%zmm1,%zmm1
+       vpaddd  %zmm6,%zmm2,%zmm2
+       vpaddd  %zmm7,%zmm3,%zmm3
+       vpxord  %zmm0,%zmm12,%zmm12
+       vpxord  %zmm1,%zmm13,%zmm13
+       vpxord  %zmm2,%zmm14,%zmm14
+       vpxord  %zmm3,%zmm15,%zmm15
+       vprold  $8,%zmm12,%zmm12
+       vprold  $8,%zmm13,%zmm13
+       vprold  $8,%zmm14,%zmm14
+       vprold  $8,%zmm15,%zmm15
+       vpaddd  %zmm12,%zmm8,%zmm8
+       vpaddd  %zmm13,%zmm9,%zmm9
+       vpaddd  %zmm14,%zmm10,%zmm10
+       vpaddd  %zmm15,%zmm11,%zmm11
+       vpxord  %zmm8,%zmm4,%zmm4
+       vpxord  %zmm9,%zmm5,%zmm5
+       vpxord  %zmm10,%zmm6,%zmm6
+       vpxord  %zmm11,%zmm7,%zmm7
+       vprold  $7,%zmm4,%zmm4
+       vprold  $7,%zmm5,%zmm5
+       vprold  $7,%zmm6,%zmm6
+       vprold  $7,%zmm7,%zmm7
+       vpaddd  %zmm5,%zmm0,%zmm0
+       vpaddd  %zmm6,%zmm1,%zmm1
+       vpaddd  %zmm7,%zmm2,%zmm2
+       vpaddd  %zmm4,%zmm3,%zmm3
+       vpxord  %zmm0,%zmm15,%zmm15
+       vpxord  %zmm1,%zmm12,%zmm12
+       vpxord  %zmm2,%zmm13,%zmm13
+       vpxord  %zmm3,%zmm14,%zmm14
+       vprold  $16,%zmm15,%zmm15
+       vprold  $16,%zmm12,%zmm12
+       vprold  $16,%zmm13,%zmm13
+       vprold  $16,%zmm14,%zmm14
+       vpaddd  %zmm15,%zmm10,%zmm10
+       vpaddd  %zmm12,%zmm11,%zmm11
+       vpaddd  %zmm13,%zmm8,%zmm8
+       vpaddd  %zmm14,%zmm9,%zmm9
+       vpxord  %zmm10,%zmm5,%zmm5
+       vpxord  %zmm11,%zmm6,%zmm6
+       vpxord  %zmm8,%zmm7,%zmm7
+       vpxord  %zmm9,%zmm4,%zmm4
+       vprold  $12,%zmm5,%zmm5
+       vprold  $12,%zmm6,%zmm6
+       vprold  $12,%zmm7,%zmm7
+       vprold  $12,%zmm4,%zmm4
+       vpaddd  %zmm5,%zmm0,%zmm0
+       vpaddd  %zmm6,%zmm1,%zmm1
+       vpaddd  %zmm7,%zmm2,%zmm2
+       vpaddd  %zmm4,%zmm3,%zmm3
+       vpxord  %zmm0,%zmm15,%zmm15
+       vpxord  %zmm1,%zmm12,%zmm12
+       vpxord  %zmm2,%zmm13,%zmm13
+       vpxord  %zmm3,%zmm14,%zmm14
+       vprold  $8,%zmm15,%zmm15
+       vprold  $8,%zmm12,%zmm12
+       vprold  $8,%zmm13,%zmm13
+       vprold  $8,%zmm14,%zmm14
+       vpaddd  %zmm15,%zmm10,%zmm10
+       vpaddd  %zmm12,%zmm11,%zmm11
+       vpaddd  %zmm13,%zmm8,%zmm8
+       vpaddd  %zmm14,%zmm9,%zmm9
+       vpxord  %zmm10,%zmm5,%zmm5
+       vpxord  %zmm11,%zmm6,%zmm6
+       vpxord  %zmm8,%zmm7,%zmm7
+       vpxord  %zmm9,%zmm4,%zmm4
+       vprold  $7,%zmm5,%zmm5
+       vprold  $7,%zmm6,%zmm6
+       vprold  $7,%zmm7,%zmm7
+       vprold  $7,%zmm4,%zmm4
+       decl    %eax
+       jnz     .Loop16x
+
+       vpaddd  %zmm16,%zmm0,%zmm0
+       vpaddd  %zmm17,%zmm1,%zmm1
+       vpaddd  %zmm18,%zmm2,%zmm2
+       vpaddd  %zmm19,%zmm3,%zmm3
+
+       vpunpckldq      %zmm1,%zmm0,%zmm18
+       vpunpckldq      %zmm3,%zmm2,%zmm19
+       vpunpckhdq      %zmm1,%zmm0,%zmm0
+       vpunpckhdq      %zmm3,%zmm2,%zmm2
+       vpunpcklqdq     %zmm19,%zmm18,%zmm1
+       vpunpckhqdq     %zmm19,%zmm18,%zmm18
+       vpunpcklqdq     %zmm2,%zmm0,%zmm3
+       vpunpckhqdq     %zmm2,%zmm0,%zmm0
+       vpaddd  %zmm20,%zmm4,%zmm4
+       vpaddd  %zmm21,%zmm5,%zmm5
+       vpaddd  %zmm22,%zmm6,%zmm6
+       vpaddd  %zmm23,%zmm7,%zmm7
+
+       vpunpckldq      %zmm5,%zmm4,%zmm2
+       vpunpckldq      %zmm7,%zmm6,%zmm19
+       vpunpckhdq      %zmm5,%zmm4,%zmm4
+       vpunpckhdq      %zmm7,%zmm6,%zmm6
+       vpunpcklqdq     %zmm19,%zmm2,%zmm5
+       vpunpckhqdq     %zmm19,%zmm2,%zmm2
+       vpunpcklqdq     %zmm6,%zmm4,%zmm7
+       vpunpckhqdq     %zmm6,%zmm4,%zmm4
+       vshufi32x4      $0x44,%zmm5,%zmm1,%zmm19
+       vshufi32x4      $0xee,%zmm5,%zmm1,%zmm5
+       vshufi32x4      $0x44,%zmm2,%zmm18,%zmm1
+       vshufi32x4      $0xee,%zmm2,%zmm18,%zmm2
+       vshufi32x4      $0x44,%zmm7,%zmm3,%zmm18
+       vshufi32x4      $0xee,%zmm7,%zmm3,%zmm7
+       vshufi32x4      $0x44,%zmm4,%zmm0,%zmm3
+       vshufi32x4      $0xee,%zmm4,%zmm0,%zmm4
+       vpaddd  %zmm24,%zmm8,%zmm8
+       vpaddd  %zmm25,%zmm9,%zmm9
+       vpaddd  %zmm26,%zmm10,%zmm10
+       vpaddd  %zmm27,%zmm11,%zmm11
+
+       vpunpckldq      %zmm9,%zmm8,%zmm6
+       vpunpckldq      %zmm11,%zmm10,%zmm0
+       vpunpckhdq      %zmm9,%zmm8,%zmm8
+       vpunpckhdq      %zmm11,%zmm10,%zmm10
+       vpunpcklqdq     %zmm0,%zmm6,%zmm9
+       vpunpckhqdq     %zmm0,%zmm6,%zmm6
+       vpunpcklqdq     %zmm10,%zmm8,%zmm11
+       vpunpckhqdq     %zmm10,%zmm8,%zmm8
+       vpaddd  %zmm28,%zmm12,%zmm12
+       vpaddd  %zmm29,%zmm13,%zmm13
+       vpaddd  %zmm30,%zmm14,%zmm14
+       vpaddd  %zmm31,%zmm15,%zmm15
+
+       vpunpckldq      %zmm13,%zmm12,%zmm10
+       vpunpckldq      %zmm15,%zmm14,%zmm0
+       vpunpckhdq      %zmm13,%zmm12,%zmm12
+       vpunpckhdq      %zmm15,%zmm14,%zmm14
+       vpunpcklqdq     %zmm0,%zmm10,%zmm13
+       vpunpckhqdq     %zmm0,%zmm10,%zmm10
+       vpunpcklqdq     %zmm14,%zmm12,%zmm15
+       vpunpckhqdq     %zmm14,%zmm12,%zmm12
+       vshufi32x4      $0x44,%zmm13,%zmm9,%zmm0
+       vshufi32x4      $0xee,%zmm13,%zmm9,%zmm13
+       vshufi32x4      $0x44,%zmm10,%zmm6,%zmm9
+       vshufi32x4      $0xee,%zmm10,%zmm6,%zmm10
+       vshufi32x4      $0x44,%zmm15,%zmm11,%zmm6
+       vshufi32x4      $0xee,%zmm15,%zmm11,%zmm15
+       vshufi32x4      $0x44,%zmm12,%zmm8,%zmm11
+       vshufi32x4      $0xee,%zmm12,%zmm8,%zmm12
+       vshufi32x4      $0x88,%zmm0,%zmm19,%zmm16
+       vshufi32x4      $0xdd,%zmm0,%zmm19,%zmm19
+       vshufi32x4      $0x88,%zmm13,%zmm5,%zmm0
+       vshufi32x4      $0xdd,%zmm13,%zmm5,%zmm13
+       vshufi32x4      $0x88,%zmm9,%zmm1,%zmm17
+       vshufi32x4      $0xdd,%zmm9,%zmm1,%zmm1
+       vshufi32x4      $0x88,%zmm10,%zmm2,%zmm9
+       vshufi32x4      $0xdd,%zmm10,%zmm2,%zmm10
+       vshufi32x4      $0x88,%zmm6,%zmm18,%zmm14
+       vshufi32x4      $0xdd,%zmm6,%zmm18,%zmm18
+       vshufi32x4      $0x88,%zmm15,%zmm7,%zmm6
+       vshufi32x4      $0xdd,%zmm15,%zmm7,%zmm15
+       vshufi32x4      $0x88,%zmm11,%zmm3,%zmm8
+       vshufi32x4      $0xdd,%zmm11,%zmm3,%zmm3
+       vshufi32x4      $0x88,%zmm12,%zmm4,%zmm11
+       vshufi32x4      $0xdd,%zmm12,%zmm4,%zmm12
+       cmpq    $1024,%rdx
+       jb      .Ltail16x
+
+       vpxord  0(%rsi),%zmm16,%zmm16
+       vpxord  64(%rsi),%zmm17,%zmm17
+       vpxord  128(%rsi),%zmm14,%zmm14
+       vpxord  192(%rsi),%zmm8,%zmm8
+       vmovdqu32       %zmm16,0(%rdi)
+       vmovdqu32       %zmm17,64(%rdi)
+       vmovdqu32       %zmm14,128(%rdi)
+       vmovdqu32       %zmm8,192(%rdi)
+
+       vpxord  256(%rsi),%zmm19,%zmm19
+       vpxord  320(%rsi),%zmm1,%zmm1
+       vpxord  384(%rsi),%zmm18,%zmm18
+       vpxord  448(%rsi),%zmm3,%zmm3
+       vmovdqu32       %zmm19,256(%rdi)
+       vmovdqu32       %zmm1,320(%rdi)
+       vmovdqu32       %zmm18,384(%rdi)
+       vmovdqu32       %zmm3,448(%rdi)
+
+       vpxord  512(%rsi),%zmm0,%zmm0
+       vpxord  576(%rsi),%zmm9,%zmm9
+       vpxord  640(%rsi),%zmm6,%zmm6
+       vpxord  704(%rsi),%zmm11,%zmm11
+       vmovdqu32       %zmm0,512(%rdi)
+       vmovdqu32       %zmm9,576(%rdi)
+       vmovdqu32       %zmm6,640(%rdi)
+       vmovdqu32       %zmm11,704(%rdi)
+
+       vpxord  768(%rsi),%zmm13,%zmm13
+       vpxord  832(%rsi),%zmm10,%zmm10
+       vpxord  896(%rsi),%zmm15,%zmm15
+       vpxord  960(%rsi),%zmm12,%zmm12
+       leaq    1024(%rsi),%rsi
+       vmovdqu32       %zmm13,768(%rdi)
+       vmovdqu32       %zmm10,832(%rdi)
+       vmovdqu32       %zmm15,896(%rdi)
+       vmovdqu32       %zmm12,960(%rdi)
+       leaq    1024(%rdi),%rdi
+
+       subq    $1024,%rdx
+       jnz     .Loop_outer16x
+
+       jmp     .Ldone16x
+
+.align 32
+.Ltail16x:
+       xorq    %r10,%r10
+       subq    %rsi,%rdi
+       cmpq    $64,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm16,%zmm16
+       vmovdqu32       %zmm16,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm17,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $128,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm17,%zmm17
+       vmovdqu32       %zmm17,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm14,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $192,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm14,%zmm14
+       vmovdqu32       %zmm14,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm8,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $256,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm8,%zmm8
+       vmovdqu32       %zmm8,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm19,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $320,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm19,%zmm19
+       vmovdqu32       %zmm19,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm1,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $384,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm1,%zmm1
+       vmovdqu32       %zmm1,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm18,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $448,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm18,%zmm18
+       vmovdqu32       %zmm18,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm3,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $512,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm3,%zmm3
+       vmovdqu32       %zmm3,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm0,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $576,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm0,%zmm0
+       vmovdqu32       %zmm0,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm9,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $640,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm9,%zmm9
+       vmovdqu32       %zmm9,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm6,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $704,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm6,%zmm6
+       vmovdqu32       %zmm6,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm11,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $768,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm11,%zmm11
+       vmovdqu32       %zmm11,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm13,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $832,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm13,%zmm13
+       vmovdqu32       %zmm13,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm10,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $896,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm10,%zmm10
+       vmovdqu32       %zmm10,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm15,%zmm16
+       leaq    64(%rsi),%rsi
+
+       cmpq    $960,%rdx
+       jb      .Less_than_64_16x
+       vpxord  (%rsi),%zmm15,%zmm15
+       vmovdqu32       %zmm15,(%rdi,%rsi,1)
+       je      .Ldone16x
+       vmovdqa32       %zmm12,%zmm16
+       leaq    64(%rsi),%rsi
+
+.Less_than_64_16x:
+       vmovdqa32       %zmm16,0(%rsp)
+       leaq    (%rdi,%rsi,1),%rdi
+       andq    $63,%rdx
+
+.Loop_tail16x:
+       movzbl  (%rsi,%r10,1),%eax
+       movzbl  (%rsp,%r10,1),%ecx
+       leaq    1(%r10),%r10
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r10,1)
+       decq    %rdx
+       jnz     .Loop_tail16x
+
+       vpxord  %zmm16,%zmm16,%zmm16
+       vmovdqa32       %zmm16,0(%rsp)
+
+.Ldone16x:
+       vzeroall
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L16x_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_16x,.-ChaCha20_16x
+.type  ChaCha20_8xvl,@function
+.align 32
+ChaCha20_8xvl:
+.cfi_startproc 
+.LChaCha20_8xvl:
+       movq    %rsp,%r9
+.cfi_def_cfa_register  %r9
+       subq    $64+8,%rsp
+       andq    $-64,%rsp
+       vzeroupper
+
+       leaq    .Lsigma(%rip),%r10
+       vbroadcasti128  (%r10),%ymm3
+       vbroadcasti128  (%rcx),%ymm7
+       vbroadcasti128  16(%rcx),%ymm11
+       vbroadcasti128  (%r8),%ymm15
+
+       vpshufd $0x00,%ymm3,%ymm0
+       vpshufd $0x55,%ymm3,%ymm1
+       vpshufd $0xaa,%ymm3,%ymm2
+       vpshufd $0xff,%ymm3,%ymm3
+       vmovdqa64       %ymm0,%ymm16
+       vmovdqa64       %ymm1,%ymm17
+       vmovdqa64       %ymm2,%ymm18
+       vmovdqa64       %ymm3,%ymm19
+
+       vpshufd $0x00,%ymm7,%ymm4
+       vpshufd $0x55,%ymm7,%ymm5
+       vpshufd $0xaa,%ymm7,%ymm6
+       vpshufd $0xff,%ymm7,%ymm7
+       vmovdqa64       %ymm4,%ymm20
+       vmovdqa64       %ymm5,%ymm21
+       vmovdqa64       %ymm6,%ymm22
+       vmovdqa64       %ymm7,%ymm23
+
+       vpshufd $0x00,%ymm11,%ymm8
+       vpshufd $0x55,%ymm11,%ymm9
+       vpshufd $0xaa,%ymm11,%ymm10
+       vpshufd $0xff,%ymm11,%ymm11
+       vmovdqa64       %ymm8,%ymm24
+       vmovdqa64       %ymm9,%ymm25
+       vmovdqa64       %ymm10,%ymm26
+       vmovdqa64       %ymm11,%ymm27
+
+       vpshufd $0x00,%ymm15,%ymm12
+       vpshufd $0x55,%ymm15,%ymm13
+       vpshufd $0xaa,%ymm15,%ymm14
+       vpshufd $0xff,%ymm15,%ymm15
+       vpaddd  .Lincy(%rip),%ymm12,%ymm12
+       vmovdqa64       %ymm12,%ymm28
+       vmovdqa64       %ymm13,%ymm29
+       vmovdqa64       %ymm14,%ymm30
+       vmovdqa64       %ymm15,%ymm31
+
+       movl    $10,%eax
+       jmp     .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+       vpbroadcastd    8(%r10),%ymm2
+       vpbroadcastd    12(%r10),%ymm3
+       vpaddd  .Leight(%rip),%ymm28,%ymm28
+       vmovdqa64       %ymm20,%ymm4
+       vmovdqa64       %ymm21,%ymm5
+       vmovdqa64       %ymm22,%ymm6
+       vmovdqa64       %ymm23,%ymm7
+       vmovdqa64       %ymm24,%ymm8
+       vmovdqa64       %ymm25,%ymm9
+       vmovdqa64       %ymm26,%ymm10
+       vmovdqa64       %ymm27,%ymm11
+       vmovdqa64       %ymm28,%ymm12
+       vmovdqa64       %ymm29,%ymm13
+       vmovdqa64       %ymm30,%ymm14
+       vmovdqa64       %ymm31,%ymm15
+
+       vmovdqa64       %ymm0,%ymm16
+       vmovdqa64       %ymm1,%ymm17
+       vmovdqa64       %ymm2,%ymm18
+       vmovdqa64       %ymm3,%ymm19
+
+       movl    $10,%eax
+       jmp     .Loop8xvl
+
+.align 32
+.Loop8xvl:
+       vpaddd  %ymm4,%ymm0,%ymm0
+       vpaddd  %ymm5,%ymm1,%ymm1
+       vpaddd  %ymm6,%ymm2,%ymm2
+       vpaddd  %ymm7,%ymm3,%ymm3
+       vpxor   %ymm0,%ymm12,%ymm12
+       vpxor   %ymm1,%ymm13,%ymm13
+       vpxor   %ymm2,%ymm14,%ymm14
+       vpxor   %ymm3,%ymm15,%ymm15
+       vprold  $16,%ymm12,%ymm12
+       vprold  $16,%ymm13,%ymm13
+       vprold  $16,%ymm14,%ymm14
+       vprold  $16,%ymm15,%ymm15
+       vpaddd  %ymm12,%ymm8,%ymm8
+       vpaddd  %ymm13,%ymm9,%ymm9
+       vpaddd  %ymm14,%ymm10,%ymm10
+       vpaddd  %ymm15,%ymm11,%ymm11
+       vpxor   %ymm8,%ymm4,%ymm4
+       vpxor   %ymm9,%ymm5,%ymm5
+       vpxor   %ymm10,%ymm6,%ymm6
+       vpxor   %ymm11,%ymm7,%ymm7
+       vprold  $12,%ymm4,%ymm4
+       vprold  $12,%ymm5,%ymm5
+       vprold  $12,%ymm6,%ymm6
+       vprold  $12,%ymm7,%ymm7
+       vpaddd  %ymm4,%ymm0,%ymm0
+       vpaddd  %ymm5,%ymm1,%ymm1
+       vpaddd  %ymm6,%ymm2,%ymm2
+       vpaddd  %ymm7,%ymm3,%ymm3
+       vpxor   %ymm0,%ymm12,%ymm12
+       vpxor   %ymm1,%ymm13,%ymm13
+       vpxor   %ymm2,%ymm14,%ymm14
+       vpxor   %ymm3,%ymm15,%ymm15
+       vprold  $8,%ymm12,%ymm12
+       vprold  $8,%ymm13,%ymm13
+       vprold  $8,%ymm14,%ymm14
+       vprold  $8,%ymm15,%ymm15
+       vpaddd  %ymm12,%ymm8,%ymm8
+       vpaddd  %ymm13,%ymm9,%ymm9
+       vpaddd  %ymm14,%ymm10,%ymm10
+       vpaddd  %ymm15,%ymm11,%ymm11
+       vpxor   %ymm8,%ymm4,%ymm4
+       vpxor   %ymm9,%ymm5,%ymm5
+       vpxor   %ymm10,%ymm6,%ymm6
+       vpxor   %ymm11,%ymm7,%ymm7
+       vprold  $7,%ymm4,%ymm4
+       vprold  $7,%ymm5,%ymm5
+       vprold  $7,%ymm6,%ymm6
+       vprold  $7,%ymm7,%ymm7
+       vpaddd  %ymm5,%ymm0,%ymm0
+       vpaddd  %ymm6,%ymm1,%ymm1
+       vpaddd  %ymm7,%ymm2,%ymm2
+       vpaddd  %ymm4,%ymm3,%ymm3
+       vpxor   %ymm0,%ymm15,%ymm15
+       vpxor   %ymm1,%ymm12,%ymm12
+       vpxor   %ymm2,%ymm13,%ymm13
+       vpxor   %ymm3,%ymm14,%ymm14
+       vprold  $16,%ymm15,%ymm15
+       vprold  $16,%ymm12,%ymm12
+       vprold  $16,%ymm13,%ymm13
+       vprold  $16,%ymm14,%ymm14
+       vpaddd  %ymm15,%ymm10,%ymm10
+       vpaddd  %ymm12,%ymm11,%ymm11
+       vpaddd  %ymm13,%ymm8,%ymm8
+       vpaddd  %ymm14,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm5,%ymm5
+       vpxor   %ymm11,%ymm6,%ymm6
+       vpxor   %ymm8,%ymm7,%ymm7
+       vpxor   %ymm9,%ymm4,%ymm4
+       vprold  $12,%ymm5,%ymm5
+       vprold  $12,%ymm6,%ymm6
+       vprold  $12,%ymm7,%ymm7
+       vprold  $12,%ymm4,%ymm4
+       vpaddd  %ymm5,%ymm0,%ymm0
+       vpaddd  %ymm6,%ymm1,%ymm1
+       vpaddd  %ymm7,%ymm2,%ymm2
+       vpaddd  %ymm4,%ymm3,%ymm3
+       vpxor   %ymm0,%ymm15,%ymm15
+       vpxor   %ymm1,%ymm12,%ymm12
+       vpxor   %ymm2,%ymm13,%ymm13
+       vpxor   %ymm3,%ymm14,%ymm14
+       vprold  $8,%ymm15,%ymm15
+       vprold  $8,%ymm12,%ymm12
+       vprold  $8,%ymm13,%ymm13
+       vprold  $8,%ymm14,%ymm14
+       vpaddd  %ymm15,%ymm10,%ymm10
+       vpaddd  %ymm12,%ymm11,%ymm11
+       vpaddd  %ymm13,%ymm8,%ymm8
+       vpaddd  %ymm14,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm5,%ymm5
+       vpxor   %ymm11,%ymm6,%ymm6
+       vpxor   %ymm8,%ymm7,%ymm7
+       vpxor   %ymm9,%ymm4,%ymm4
+       vprold  $7,%ymm5,%ymm5
+       vprold  $7,%ymm6,%ymm6
+       vprold  $7,%ymm7,%ymm7
+       vprold  $7,%ymm4,%ymm4
+       decl    %eax
+       jnz     .Loop8xvl
+
+       vpaddd  %ymm16,%ymm0,%ymm0
+       vpaddd  %ymm17,%ymm1,%ymm1
+       vpaddd  %ymm18,%ymm2,%ymm2
+       vpaddd  %ymm19,%ymm3,%ymm3
+
+       vpunpckldq      %ymm1,%ymm0,%ymm18
+       vpunpckldq      %ymm3,%ymm2,%ymm19
+       vpunpckhdq      %ymm1,%ymm0,%ymm0
+       vpunpckhdq      %ymm3,%ymm2,%ymm2
+       vpunpcklqdq     %ymm19,%ymm18,%ymm1
+       vpunpckhqdq     %ymm19,%ymm18,%ymm18
+       vpunpcklqdq     %ymm2,%ymm0,%ymm3
+       vpunpckhqdq     %ymm2,%ymm0,%ymm0
+       vpaddd  %ymm20,%ymm4,%ymm4
+       vpaddd  %ymm21,%ymm5,%ymm5
+       vpaddd  %ymm22,%ymm6,%ymm6
+       vpaddd  %ymm23,%ymm7,%ymm7
+
+       vpunpckldq      %ymm5,%ymm4,%ymm2
+       vpunpckldq      %ymm7,%ymm6,%ymm19
+       vpunpckhdq      %ymm5,%ymm4,%ymm4
+       vpunpckhdq      %ymm7,%ymm6,%ymm6
+       vpunpcklqdq     %ymm19,%ymm2,%ymm5
+       vpunpckhqdq     %ymm19,%ymm2,%ymm2
+       vpunpcklqdq     %ymm6,%ymm4,%ymm7
+       vpunpckhqdq     %ymm6,%ymm4,%ymm4
+       vshufi32x4      $0,%ymm5,%ymm1,%ymm19
+       vshufi32x4      $3,%ymm5,%ymm1,%ymm5
+       vshufi32x4      $0,%ymm2,%ymm18,%ymm1
+       vshufi32x4      $3,%ymm2,%ymm18,%ymm2
+       vshufi32x4      $0,%ymm7,%ymm3,%ymm18
+       vshufi32x4      $3,%ymm7,%ymm3,%ymm7
+       vshufi32x4      $0,%ymm4,%ymm0,%ymm3
+       vshufi32x4      $3,%ymm4,%ymm0,%ymm4
+       vpaddd  %ymm24,%ymm8,%ymm8
+       vpaddd  %ymm25,%ymm9,%ymm9
+       vpaddd  %ymm26,%ymm10,%ymm10
+       vpaddd  %ymm27,%ymm11,%ymm11
+
+       vpunpckldq      %ymm9,%ymm8,%ymm6
+       vpunpckldq      %ymm11,%ymm10,%ymm0
+       vpunpckhdq      %ymm9,%ymm8,%ymm8
+       vpunpckhdq      %ymm11,%ymm10,%ymm10
+       vpunpcklqdq     %ymm0,%ymm6,%ymm9
+       vpunpckhqdq     %ymm0,%ymm6,%ymm6
+       vpunpcklqdq     %ymm10,%ymm8,%ymm11
+       vpunpckhqdq     %ymm10,%ymm8,%ymm8
+       vpaddd  %ymm28,%ymm12,%ymm12
+       vpaddd  %ymm29,%ymm13,%ymm13
+       vpaddd  %ymm30,%ymm14,%ymm14
+       vpaddd  %ymm31,%ymm15,%ymm15
+
+       vpunpckldq      %ymm13,%ymm12,%ymm10
+       vpunpckldq      %ymm15,%ymm14,%ymm0
+       vpunpckhdq      %ymm13,%ymm12,%ymm12
+       vpunpckhdq      %ymm15,%ymm14,%ymm14
+       vpunpcklqdq     %ymm0,%ymm10,%ymm13
+       vpunpckhqdq     %ymm0,%ymm10,%ymm10
+       vpunpcklqdq     %ymm14,%ymm12,%ymm15
+       vpunpckhqdq     %ymm14,%ymm12,%ymm12
+       vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
+       vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
+       vperm2i128      $0x20,%ymm10,%ymm6,%ymm9
+       vperm2i128      $0x31,%ymm10,%ymm6,%ymm10
+       vperm2i128      $0x20,%ymm15,%ymm11,%ymm6
+       vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
+       vperm2i128      $0x20,%ymm12,%ymm8,%ymm11
+       vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
+       cmpq    $512,%rdx
+       jb      .Ltail8xvl
+
+       movl    $0x80,%eax
+       vpxord  0(%rsi),%ymm19,%ymm19
+       vpxor   32(%rsi),%ymm0,%ymm0
+       vpxor   64(%rsi),%ymm5,%ymm5
+       vpxor   96(%rsi),%ymm13,%ymm13
+       leaq    (%rsi,%rax,1),%rsi
+       vmovdqu32       %ymm19,0(%rdi)
+       vmovdqu %ymm0,32(%rdi)
+       vmovdqu %ymm5,64(%rdi)
+       vmovdqu %ymm13,96(%rdi)
+       leaq    (%rdi,%rax,1),%rdi
+
+       vpxor   0(%rsi),%ymm1,%ymm1
+       vpxor   32(%rsi),%ymm9,%ymm9
+       vpxor   64(%rsi),%ymm2,%ymm2
+       vpxor   96(%rsi),%ymm10,%ymm10
+       leaq    (%rsi,%rax,1),%rsi
+       vmovdqu %ymm1,0(%rdi)
+       vmovdqu %ymm9,32(%rdi)
+       vmovdqu %ymm2,64(%rdi)
+       vmovdqu %ymm10,96(%rdi)
+       leaq    (%rdi,%rax,1),%rdi
+
+       vpxord  0(%rsi),%ymm18,%ymm18
+       vpxor   32(%rsi),%ymm6,%ymm6
+       vpxor   64(%rsi),%ymm7,%ymm7
+       vpxor   96(%rsi),%ymm15,%ymm15
+       leaq    (%rsi,%rax,1),%rsi
+       vmovdqu32       %ymm18,0(%rdi)
+       vmovdqu %ymm6,32(%rdi)
+       vmovdqu %ymm7,64(%rdi)
+       vmovdqu %ymm15,96(%rdi)
+       leaq    (%rdi,%rax,1),%rdi
+
+       vpxor   0(%rsi),%ymm3,%ymm3
+       vpxor   32(%rsi),%ymm11,%ymm11
+       vpxor   64(%rsi),%ymm4,%ymm4
+       vpxor   96(%rsi),%ymm12,%ymm12
+       leaq    (%rsi,%rax,1),%rsi
+       vmovdqu %ymm3,0(%rdi)
+       vmovdqu %ymm11,32(%rdi)
+       vmovdqu %ymm4,64(%rdi)
+       vmovdqu %ymm12,96(%rdi)
+       leaq    (%rdi,%rax,1),%rdi
+
+       vpbroadcastd    0(%r10),%ymm0
+       vpbroadcastd    4(%r10),%ymm1
+
+       subq    $512,%rdx
+       jnz     .Loop_outer8xvl
+
+       jmp     .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+       vmovdqa64       %ymm19,%ymm8
+       xorq    %r10,%r10
+       subq    %rsi,%rdi
+       cmpq    $64,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm8,%ymm8
+       vpxor   32(%rsi),%ymm0,%ymm0
+       vmovdqu %ymm8,0(%rdi,%rsi,1)
+       vmovdqu %ymm0,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm5,%ymm8
+       vmovdqa %ymm13,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $128,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm5,%ymm5
+       vpxor   32(%rsi),%ymm13,%ymm13
+       vmovdqu %ymm5,0(%rdi,%rsi,1)
+       vmovdqu %ymm13,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm1,%ymm8
+       vmovdqa %ymm9,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $192,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm1,%ymm1
+       vpxor   32(%rsi),%ymm9,%ymm9
+       vmovdqu %ymm1,0(%rdi,%rsi,1)
+       vmovdqu %ymm9,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm2,%ymm8
+       vmovdqa %ymm10,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $256,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm2,%ymm2
+       vpxor   32(%rsi),%ymm10,%ymm10
+       vmovdqu %ymm2,0(%rdi,%rsi,1)
+       vmovdqu %ymm10,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa32       %ymm18,%ymm8
+       vmovdqa %ymm6,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $320,%rdx
+       jb      .Less_than_64_8xvl
+       vpxord  0(%rsi),%ymm18,%ymm18
+       vpxor   32(%rsi),%ymm6,%ymm6
+       vmovdqu32       %ymm18,0(%rdi,%rsi,1)
+       vmovdqu %ymm6,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm7,%ymm8
+       vmovdqa %ymm15,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $384,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm7,%ymm7
+       vpxor   32(%rsi),%ymm15,%ymm15
+       vmovdqu %ymm7,0(%rdi,%rsi,1)
+       vmovdqu %ymm15,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm3,%ymm8
+       vmovdqa %ymm11,%ymm0
+       leaq    64(%rsi),%rsi
+
+       cmpq    $448,%rdx
+       jb      .Less_than_64_8xvl
+       vpxor   0(%rsi),%ymm3,%ymm3
+       vpxor   32(%rsi),%ymm11,%ymm11
+       vmovdqu %ymm3,0(%rdi,%rsi,1)
+       vmovdqu %ymm11,32(%rdi,%rsi,1)
+       je      .Ldone8xvl
+       vmovdqa %ymm4,%ymm8
+       vmovdqa %ymm12,%ymm0
+       leaq    64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+       vmovdqa %ymm8,0(%rsp)
+       vmovdqa %ymm0,32(%rsp)
+       leaq    (%rdi,%rsi,1),%rdi
+       andq    $63,%rdx
+
+.Loop_tail8xvl:
+       movzbl  (%rsi,%r10,1),%eax
+       movzbl  (%rsp,%r10,1),%ecx
+       leaq    1(%r10),%r10
+       xorl    %ecx,%eax
+       movb    %al,-1(%rdi,%r10,1)
+       decq    %rdx
+       jnz     .Loop_tail8xvl
+
+       vpxor   %ymm8,%ymm8,%ymm8
+       vmovdqa %ymm8,0(%rsp)
+       vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+       vzeroall
+       leaq    (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L8xvl_epilogue:
+       .byte   0xf3,0xc3
+.cfi_endproc   
+.size  ChaCha20_8xvl,.-ChaCha20_8xvl
-- 
2.19.0

Reply via email to