https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79726

            Bug ID: 79726
           Summary: Type conversion not vectorised
           Product: gcc
           Version: 7.0.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: drraph at gmail dot com
  Target Milestone: ---

Consider:

/* Reproducer: adds 16 doubles from x[] into a float accumulator that starts
   at 1.0 and returns the accumulated value as a double. */
double f(double x[]) {
  float p = 1.0;   /* accumulator is float even though the inputs are double */
  for (int i = 0; i < 16; i++)
    p += x[i];     /* float += double: the sum is stored back into a float each iteration */
  return p;        /* converted to double for the return value */
}

gcc with -O3 -march=core-avx2 -ffast-math gives:

# Compiler output quoted in the report for f() (Intel operand order: destination first).
# rdi is used as the base address of x[]; the running sum is kept in xmm0.
# After the initial constant load, the same three-instruction step repeats for
# byte offsets 0, 8, ..., 120 (16 elements):
#   vaddsd     add the next double from memory to the running sum
#   vcvtsd2ss  narrow the sum to single precision
#   vcvtss2sd  widen it back to double precision
# Every instruction is scalar and each step depends on the previous one through xmm0.
f:
        vmovsd  xmm0, QWORD PTR .LC0[rip]
        vaddsd  xmm0, xmm0, QWORD PTR [rdi]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+8]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+16]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+24]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+32]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+40]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+48]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+56]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+64]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+72]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+80]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+88]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+96]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+104]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+112]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, QWORD PTR [rdi+120]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        ret
# 8-byte constant read by the first vmovsd: low word 0, high word 1072693248
# (0x3FF00000), which is the IEEE-754 double-precision encoding of 1.0.
.LC0:
        .long   0
        .long   1072693248


However, the following would be more efficient:

# Alternative sequence proposed in the report (Intel operand order: destination first).
# It converts the 16 doubles at [rdi] to floats four at a time and sums them in
# single precision, then adds the constant and widens the result to double.
# The trailing "#N.M" annotations are kept exactly as quoted; presumably they are
# source line.column markers emitted by the generating compiler.
# NOTE(review): the additions happen in a different order and precision than a
# strictly sequential sum; presumably this is only valid under -ffast-math - confirm.
f:
        # Each vcvtpd2ps reads four doubles (32 bytes) and produces four floats.
        vcvtpd2ps xmm0, YMMWORD PTR [rdi]                       #4.5
        vcvtpd2ps xmm1, YMMWORD PTR [32+rdi]                    #4.5
        vcvtpd2ps xmm2, YMMWORD PTR [64+rdi]                    #4.5
        vcvtpd2ps xmm3, YMMWORD PTR [96+rdi]                    #4.5
        # Pairwise vector adds reduce the four vectors to one (xmm6).
        vaddps    xmm4, xmm0, xmm1                              #2.11
        vaddps    xmm5, xmm2, xmm3                              #2.11
        vaddps    xmm6, xmm4, xmm5                              #2.11
        # Horizontal reduction: fold the upper two lanes onto the lower two,
        # then bring lane 1 down to lane 0 (shuffle immediate 245 = 0xF5) and add.
        vmovhlps  xmm7, xmm6, xmm6                              #2.11
        vaddps    xmm8, xmm6, xmm7                              #2.11
        vshufps   xmm9, xmm8, xmm8, 245                         #2.11
        vaddss    xmm10, xmm8, xmm9                             #2.11
        # Add the 4-byte constant below, then widen to double for the return in xmm0.
        vaddss    xmm0, xmm10, DWORD PTR .L_2il0floatpacket.0[rip] #2.11
        vcvtss2sd xmm0, xmm0, xmm0                              #5.10
        vzeroupper                                              #5.10
        ret                                                     #5.10
# 0x3f800000 is the IEEE-754 single-precision encoding of 1.0.
.L_2il0floatpacket.0:
        .long   0x3f800000

Reply via email to