https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108141
--- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Just counting the number of instructions, it is shorter though:
--- pr64110.s.r13-4726	2022-12-16 03:57:42.000000000 -0500
+++ pr64110.s.r13-4727	2022-12-16 03:57:48.000000000 -0500
@@ -11,106 +11,105 @@ bar:
 	pushl	%ebx
 	andl	$-32, %esp
 	subl	$64, %esp
-	movzwl	8(%ebp), %edi
+	movzwl	8(%ebp), %eax
+	movw	%ax, 30(%esp)
 	movl	a, %eax
-	vmovd	%edi, %xmm1
+	vpbroadcastw	30(%esp), %xmm2
+	vpbroadcastw	30(%esp), %ymm0
 	leal	-1(%eax), %edx
-	vmovd	%edi, %xmm0
 	movl	%edx, a
-	vpbroadcastw	%xmm1, %xmm1
-	vpbroadcastw	%xmm0, %ymm0
+	vmovdqa	%xmm2, (%esp)
 	testl	%eax, %eax
-	je	.L24
-	movw	%di, 62(%esp)
-	vmovdqa	%xmm1, 32(%esp)
-	vmovdqa	%ymm0, (%esp)
-	vzeroupper
+	je	.L23
 	.p2align 4,,10
 	.p2align 3
 .L2:
+	vmovdqa	%ymm0, 32(%esp)
+	vzeroupper
 	call	foo
+	vmovdqa	32(%esp), %ymm0
 	testl	%eax, %eax
 	jle	.L3
-	leal	-1(%eax), %edi
+	leal	-1(%eax), %esi
 	movl	b, %ebx
-	movl	%edi, 52(%esp)
-	cmpl	$14, %edi
+	movl	%esi, 24(%esp)
+	cmpl	$14, %esi
 	jbe	.L10
 	movl	%eax, %ecx
 	movl	%ebx, %edx
 	shrl	$4, %ecx
 	sall	$5, %ecx
-	leal	(%ecx,%ebx), %esi
+	leal	(%ecx,%ebx), %edi
 	andl	$32, %ecx
-	jne	.L21
-	movzwl	62(%esp), %edi
-	vmovdqa	(%esp), %ymm0
+	je	.L5
+	leal	32(%ebx), %edx
+	vmovdqu	%ymm0, (%ebx)
+	cmpl	%edi, %edx
+	je	.L22
 	.p2align 4,,10
 	.p2align 3
 .L5:
 	vmovdqu	%ymm0, (%edx)
 	addl	$64, %edx
 	vmovdqu	%ymm0, -32(%edx)
-	cmpl	%esi, %edx
+	cmpl	%edi, %edx
 	jne	.L5
-	movw	%di, 62(%esp)
 .L22:
 	movl	%eax, %edx
 	andl	$-16, %edx
-	movl	%edx, %esi
+	movl	%edx, %edi
 	leal	(%ebx,%edx,2), %ecx
 	cmpl	%edx, %eax
-	je	.L29
-	vzeroupper
+	je	.L6
 .L4:
-	movl	%eax, %edi
-	subl	%esi, %edi
-	movl	%edi, 56(%esp)
-	decl	%edi
-	cmpl	$6, %edi
+	movl	%eax, %esi
+	subl	%edi, %esi
+	movl	%esi, 32(%esp)
+	decl	%esi
+	cmpl	$6, %esi
 	jbe	.L7
-	movl	56(%esp), %edi
-	vmovdqa	32(%esp), %xmm2
-	vmovdqu	%xmm2, (%ebx,%esi,2)
-	movl	%edi, %esi
-	andl	$-8, %esi
-	addl	%esi, %edx
-	leal	(%ecx,%esi,2), %ecx
-	movl	%edi, %esi
-	andl	$7, %esi
+	movl	32(%esp), %esi
+	vmovdqa	(%esp), %xmm1
+	vmovdqu	%xmm1, (%ebx,%edi,2)
+	movl	%esi, %edi
+	andl	$-8, %edi
+	addl	%edi, %edx
+	leal	(%ecx,%edi,2), %ecx
+	movl	%esi, %edi
+	andl	$7, %edi
 	je	.L6
 .L7:
-	movzwl	62(%esp), %edi
-	leal	1(%edx), %esi
-	movw	%di, (%ecx)
-	cmpl	%esi, %eax
-	jle	.L6
-	leal	2(%edx), %esi
-	movw	%di, 2(%ecx)
-	cmpl	%esi, %eax
-	jle	.L6
-	leal	3(%edx), %esi
-	movw	%di, 4(%ecx)
-	cmpl	%esi, %eax
-	jle	.L6
-	leal	4(%edx), %esi
-	movw	%di, 6(%ecx)
-	cmpl	%esi, %eax
-	jle	.L6
-	leal	5(%edx), %esi
-	movw	%di, 8(%ecx)
-	cmpl	%esi, %eax
+	movzwl	30(%esp), %esi
+	leal	1(%edx), %edi
+	movw	%si, (%ecx)
+	cmpl	%edi, %eax
+	jle	.L6
+	leal	2(%edx), %edi
+	movw	%si, 2(%ecx)
+	cmpl	%edi, %eax
+	jle	.L6
+	leal	3(%edx), %edi
+	movw	%si, 4(%ecx)
+	cmpl	%edi, %eax
+	jle	.L6
+	leal	4(%edx), %edi
+	movw	%si, 6(%ecx)
+	cmpl	%edi, %eax
+	jle	.L6
+	leal	5(%edx), %edi
+	movw	%si, 8(%ecx)
+	cmpl	%edi, %eax
 	jle	.L6
 	addl	$6, %edx
-	movw	%di, 10(%ecx)
+	movw	%si, 10(%ecx)
 	cmpl	%edx, %eax
 	jle	.L6
-	movw	%di, 12(%ecx)
+	movw	%si, 12(%ecx)
 .L6:
 	leal	(%ebx,%eax,2), %eax
-	movzwl	62(%esp), %ecx
+	movzwl	30(%esp), %ecx
 	movl	%eax, b
-	movl	52(%esp), %eax
+	movl	24(%esp), %eax
 	movw	%cx, (%ebx,%eax,2)
 .L3:
 	movl	a, %eax
@@ -118,39 +117,19 @@ bar:
 	movl	%edx, a
 	testl	%eax, %eax
 	jne	.L2
+.L23:
+	vzeroupper
 	leal	-12(%ebp), %esp
 	popl	%ebx
 	popl	%esi
 	popl	%edi
 	popl	%ebp
 	ret
-	.p2align 4,,10
-	.p2align 3
-.L21:
-	vmovdqa	(%esp), %ymm3
-	leal	32(%ebx), %edx
-	vmovdqu	%ymm3, (%ebx)
-	cmpl	%esi, %edx
-	je	.L22
-	movzwl	62(%esp), %edi
-	vmovdqa	%ymm3, %ymm0
-	jmp	.L5
 .L10:
 	movl	%ebx, %ecx
-	xorl	%esi, %esi
+	xorl	%edi, %edi
 	xorl	%edx, %edx
 	jmp	.L4
-.L29:
-	vzeroupper
-	jmp	.L6
-.L24:
-	vzeroupper
-	leal	-12(%ebp), %esp
-	popl	%ebx
-	popl	%esi
-	popl	%edi
-	popl	%ebp
-	ret
 	.size	bar, .-bar
 	.globl	b
 	.bss
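For readers following along, here is a rough C sketch of what bar appears to be doing, reconstructed only from the assembly above; the identifiers foo, a and b come from the asm, while the types and loop shape are guesses and the actual pr64110.c testcase may differ:

int foo (void);

int a;
short *b;

void
bar (short x)
{
  /* Outer loop: runs while the global counter a is non-zero,
     decrementing it each iteration (the load/decrement/store of a).  */
  while (a--)
    {
      int i, n = foo ();
      /* Inner loop: store the 16-bit argument n times through b,
         advancing b; this is the fill loop that the vectorizer turns
         into the vpbroadcastw + vmovdqu sequences in the diff.  */
      for (i = 0; i < n; i++)
        *b++ = x;
    }
}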