https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79336

            Bug ID: 79336
           Summary: Poor vectorisation of additive reduction of complex
                    array
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: drraph at gmail dot com
  Target Milestone: ---

Consider this code:

#include <complex.h>
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 32; i++)
    p += x[i];
  return p;
}

gcc 7 with -march=core-avx2 -ffast-math gives:

f:
        lea     r10, [rsp+8]
        and     rsp, -32
        push    QWORD PTR [r10-8]
        push    rbp
        mov     rbp, rsp
        push    r10
        vmovups ymm0, YMMWORD PTR [rdi+64]
        vmovaps ymm1, YMMWORD PTR .LC0[rip]
        vaddps  ymm0, ymm0, YMMWORD PTR [rdi+32]
        vaddps  ymm1, ymm1, YMMWORD PTR [rdi]
        vaddps  ymm0, ymm0, ymm1
        vmovups ymm1, YMMWORD PTR [rdi+128]
        vaddps  ymm1, ymm1, YMMWORD PTR [rdi+96]
        vaddps  ymm0, ymm0, ymm1
        vmovups ymm1, YMMWORD PTR [rdi+192]
        vaddps  ymm1, ymm1, YMMWORD PTR [rdi+160]
        vaddps  ymm0, ymm0, ymm1
        vaddps  ymm0, ymm0, YMMWORD PTR [rdi+224]
        vunpckhps       xmm3, xmm0, xmm0
        vshufps xmm2, xmm0, xmm0, 255
        vshufps xmm1, xmm0, xmm0, 85
        vaddss  xmm1, xmm2, xmm1
        vaddss  xmm3, xmm3, xmm0
        vextractf128    xmm0, ymm0, 0x1
        vunpckhps       xmm4, xmm0, xmm0
        vshufps xmm2, xmm0, xmm0, 85
        vaddss  xmm4, xmm4, xmm0
        vshufps xmm0, xmm0, xmm0, 255
        vaddss  xmm0, xmm2, xmm0
        vaddss  xmm3, xmm3, xmm4
        vaddss  xmm1, xmm1, xmm0
        vmovss  DWORD PTR [rbp-24], xmm3
        vmovss  DWORD PTR [rbp-20], xmm1
        vzeroupper
        vmovq   xmm0, QWORD PTR [rbp-24]
        pop     r10
        pop     rbp
        lea     rsp, [r10-8]
        ret

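This is vectorised but appears to execute a number of unnecessary instructions:
it realigns the stack to 32 bytes, performs the final horizontal reduction with
scalar shuffles and vaddss, and passes the result through the stack (two vmovss
stores followed by a vmovq reload) before returning.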

By contrast, icc using the same options gives:


f:
        vmovups   ymm1, YMMWORD PTR [rdi]                       #5.10
        vmovups   ymm2, YMMWORD PTR [64+rdi]                    #5.10
        vmovups   ymm5, YMMWORD PTR [128+rdi]                   #5.10
        vmovups   ymm6, YMMWORD PTR [192+rdi]                   #5.10
        vmovsd    xmm0, QWORD PTR p.152.0.0.1[rip]              #3.19
        vaddps    ymm3, ymm1, YMMWORD PTR [32+rdi]              #3.19
        vaddps    ymm4, ymm2, YMMWORD PTR [96+rdi]              #3.19
        vaddps    ymm7, ymm5, YMMWORD PTR [160+rdi]             #3.19
        vaddps    ymm8, ymm6, YMMWORD PTR [224+rdi]             #3.19
        vaddps    ymm9, ymm3, ymm4                              #3.19
        vaddps    ymm10, ymm7, ymm8                             #3.19
        vaddps    ymm11, ymm9, ymm10                            #3.19
        vextractf128 xmm12, ymm11, 1                            #3.19
        vaddps    xmm13, xmm11, xmm12                           #3.19
        vmovhlps  xmm14, xmm13, xmm13                           #3.19
        vaddps    xmm15, xmm13, xmm14                           #3.19
        vaddps    xmm0, xmm15, xmm0                             #3.19
        vzeroupper                                              #6.10
        ret

Reply via email to