https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82426

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
             Status|UNCONFIRMED                 |NEW
           Severity|normal                      |enhancement
   Last reconfirmed|                            |2021-08-25

--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Hmm, on aarch64 we do a decent job at vectorizing this (since GCC 11):
        ldp     d4, d0, [x1]
        ldr     d7, [x0, 16]
        ldp     d6, d5, [x0]
        fmul    v3.2s, v0.2s, v7.s[1]
        ldr     d1, [x1, 16]
        fmul    v2.2s, v0.2s, v6.s[1]
        fmul    v0.2s, v0.2s, v5.s[1]
        fmla    v3.2s, v4.2s, v7.s[0]
        fmla    v2.2s, v4.2s, v6.s[0]
        fmla    v0.2s, v4.2s, v5.s[0]
        fadd    v1.2s, v1.2s, v3.2s
        stp     d2, d0, [x8]
        str     d1, [x8, 16]

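The PR's testcase isn't quoted in this comment; purely for illustration, a minimal hypothetical sketch of the kind of two-floats-per-row routine that maps onto the fmul/fmla-by-lane pattern above (struct names and exact shape are my assumptions, and the extra fadd with the value at [x1, 16] is not modelled):

/* Hypothetical sketch, not the PR's actual testcase: a 2x2 linear part
   applied to three 2-element rows; vectorized over j this is
   out_row_i = a[i][0] * b_row0 + a[i][1] * b_row1, i.e. one fmul plus
   one fmla per row on V2SF as in the aarch64 output above.  */
typedef struct { float m[3][2]; } mat32;
typedef struct { float m[2][2]; } mat22;

void compose (mat32 *restrict out, const mat32 *a, const mat22 *b)
{
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 2; j++)
      out->m[i][j] = a->m[i][0] * b->m[0][j] + a->m[i][1] * b->m[1][j];
}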
I suspect this is because V2SF does not exist on x86_64.
Using -Dfloat=double seems to produce better code for x86_64 (with -mavx2):
        vmovupd (%rdx), %ymm0
        vpermilpd       $0, (%rsi), %ymm1
        movq    %rdi, %rax
        vmovsd  32(%rsi), %xmm5
        vmovsd  40(%rsi), %xmm4
        vpermpd $68, %ymm0, %ymm2
        vpermpd $238, %ymm0, %ymm3
        vmulpd  %ymm2, %ymm1, %ymm2
        vpermilpd       $15, (%rsi), %ymm1
        vmulpd  %ymm3, %ymm1, %ymm1
        vaddpd  %ymm1, %ymm2, %ymm1
        vmulsd  %xmm5, %xmm0, %xmm2
        vmovupd %ymm1, (%rdi)
        vmovapd %xmm0, %xmm1
        vextractf128    $0x1, %ymm0, %xmm0
        vmulsd  %xmm4, %xmm0, %xmm3
        vunpckhpd       %xmm1, %xmm1, %xmm1
        vunpckhpd       %xmm0, %xmm0, %xmm0
        vmulsd  %xmm5, %xmm1, %xmm1
        vmulsd  %xmm4, %xmm0, %xmm0
        vaddsd  %xmm3, %xmm2, %xmm2
        vaddsd  32(%rdx), %xmm2, %xmm2
        vaddsd  %xmm0, %xmm1, %xmm1
        vaddsd  40(%rdx), %xmm1, %xmm1
        vmovsd  %xmm2, 32(%rdi)
        vmovsd  %xmm1, 40(%rdi)

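For context on the -Dfloat=double experiment: the preprocessor simply rewrites every "float" to "double" before compilation, so each two-element row becomes a V2DF (or half of a V4DF) value, modes the x86_64 backend does have. Applied to the hypothetical sketch above (again an assumption, not the actual testcase), it is roughly equivalent to:

/* Hypothetical double-precision variant of the sketch above; this is
   what -Dfloat=double turns the float version into.  */
typedef struct { double m[3][2]; } dmat32;
typedef struct { double m[2][2]; } dmat22;

void compose_d (dmat32 *restrict out, const dmat32 *a, const dmat22 *b)
{
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 2; j++)
      out->m[i][j] = a->m[i][0] * b->m[0][j] + a->m[i][1] * b->m[1][j];
}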