https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

            Bug ID: 89176
           Summary: Vectorizer fails to consider narrower vector width for
                    res[i] = v1[i] < v2[i] ? v2[i] : v1[i]
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hjl.tools at gmail dot com
  Target Milestone: ---

[hjl@gnu-cfl-1 pr89028]$ cat 2c.i
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };

float res[12];

void
foo (void)
{
  int i;

  for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
    res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 2c.s 
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3 
-march=haswell -S 2c.i
[hjl@gnu-cfl-1 pr89028]$ cat 2c.s
        .file   "2c.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
foo:
.LFB0:
        .cfi_startproc
        vmovaps v2(%rip), %ymm1
        vmaxps  v1(%rip), %ymm1, %ymm0
        vmovups %ymm0, res(%rip)
        vmovss  v2+32(%rip), %xmm0
        vmaxss  v1+32(%rip), %xmm0, %xmm0
        vmovss  %xmm0, res+32(%rip)
        vmovss  v2+36(%rip), %xmm0
        vmaxss  v1+36(%rip), %xmm0, %xmm0
        vmovss  %xmm0, res+36(%rip)
        vmovss  v2+40(%rip), %xmm0
        vmaxss  v1+40(%rip), %xmm0, %xmm0
        vmovss  %xmm0, res+40(%rip)
        vmovss  v2+44(%rip), %xmm0
        vmaxss  v1+44(%rip), %xmm0, %xmm0
        vmovss  %xmm0, res+44(%rip)
        vzeroupper
        ret
        .cfi_endproc

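The first 8 elements are handled by one 256-bit vmaxps, but the remaining 4
elements are not vectorized with the narrower 128-bit width: we generate 4
scalar res[i] = v1[i] < v2[i] ? v2[i] : v1[i] (vmaxss) instead of a single
128-bit vmaxps.  But the same loop with a multiplication does use a 128-bit
vmulps for the last 4 elements: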

[hjl@gnu-cfl-1 pr89028]$ cat 3a.i
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };

float res[12];


void
foo (void)
{
  int i;

  for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
    res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 3a.s
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3 
-march=haswell -S 3a.i
[hjl@gnu-cfl-1 pr89028]$ cat 3a.s
        .file   "3a.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
foo:
.LFB0:
        .cfi_startproc
        vmovaps v2(%rip), %ymm1
        vmulps  v1(%rip), %ymm1, %ymm0
        vmovaps v1+32(%rip), %xmm2
        vmovups %ymm0, res(%rip)
        vmulps  v2+32(%rip), %xmm2, %xmm0
        vmovaps %xmm0, res+32(%rip)
        vzeroupper
        ret

Reply via email to