https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106081

--- Comment #8 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
ImageMagick improved by 17% on Zen 3 and 11% on Altra
https://lnt.opensuse.org/db_default/v4/SPEC/37550
https://lnt.opensuse.org/db_default/v4/SPEC/37543
which is cool :)

The loop is now optimized as:

# Vectorized loop body quoted from the bug comment (x86-64, AVX-512, AT&T
# operand order: sources first, destination last).
#
# Each iteration reads 32 signed 16-bit integers from (%rax) and 8 doubles
# from (%rdx), widens the integers to double, and accumulates 32 products
# into four 8-lane accumulators.
#
# Register roles as visible in this listing:
#   %rax            pointer to the 16-bit data; advances by 64 bytes per trip
#   %rdx            pointer to the double data; moves DOWN by 64 bytes per trip
#   %rcx            value of %rax at which the loop stops
#   %zmm12          vpshufb byte-shuffle control (read only; set outside the loop)
#   %zmm11,13,14,15 vpermpd index vectors (read only; set outside the loop)
#   %zmm3-%zmm6     running sums, one per FMA chain
# NOTE(review): the contents of %zmm11-%zmm15 are not shown here, so the exact
# element reordering they perform cannot be confirmed from this excerpt.
.L2:
        # Load one 64-byte vector from each input stream (unaligned loads).
        vmovdqu16       (%rax), %zmm0
        vmovupd (%rdx), %zmm2
        # Step the pointers for the next iteration: %rax forward, %rdx backward.
        addq    $64, %rax
        subq    $64, %rdx
        # Build four rearrangements of the 8 loaded doubles, one per index
        # vector. The last one overwrites %zmm2 itself, so it must come after
        # the other three have read the original value.
        vpermpd %zmm2, %zmm15, %zmm9
        vpermpd %zmm2, %zmm14, %zmm8
        vpermpd %zmm2, %zmm13, %zmm7
        vpermpd %zmm2, %zmm11, %zmm2
        # Reorder the bytes of the 16-bit data using the control in %zmm12
        # (vpshufb shuffles within each 128-bit lane).
        # NOTE(review): presumably this lines the integers up with the doubles
        # that are being walked backwards via %rdx - confirm against the source loop.
        vpshufb %zmm12, %zmm0, %zmm0
        # Sign-extend int16 -> int32: low 16 words go to %zmm1, high 16 words
        # are extracted and widened in place into %zmm0.
        vpmovsxwd       %ymm0, %zmm1
        vextracti64x4   $0x1, %zmm0, %ymm0
        vpmovsxwd       %ymm0, %zmm0
        # Convert the first 16 int32 values (%zmm1) to double, 8 at a time:
        # low half -> %zmm10, high half -> %zmm1.
        vcvtdq2pd       %ymm1, %zmm10
        vextracti32x8   $0x1, %zmm1, %ymm1
        vcvtdq2pd       %ymm1, %zmm1
        # zmm6 += zmm10 * zmm2 ; zmm3 += zmm1 * zmm9
        vfmadd231pd     %zmm2, %zmm10, %zmm6
        vfmadd231pd     %zmm9, %zmm1, %zmm3
        # Convert the second 16 int32 values (%zmm0) to double the same way:
        # low half -> %zmm1 (reused now that its previous value was consumed),
        # high half -> %zmm0.
        vcvtdq2pd       %ymm0, %zmm1
        vextracti32x8   $0x1, %zmm0, %ymm0
        vcvtdq2pd       %ymm0, %zmm0
        # zmm5 += zmm1 * zmm8 ; zmm4 += zmm0 * zmm7
        vfmadd231pd     %zmm8, %zmm1, %zmm5
        vfmadd231pd     %zmm7, %zmm0, %zmm4
        # Loop until the forward pointer reaches the end value in %rcx.
        cmpq    %rax, %rcx
        jne     .L2

Reply via email to