https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176
Bug ID: 89176
Summary: Vectorizer fails to consider narrower vector width for
res[i] = v1[i] < v2[i] ? v2[i] : v1[i]
Product: gcc
Version: 9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hjl.tools at gmail dot com
Target Milestone: ---
[hjl@gnu-cfl-1 pr89028]$ cat 2c.i
/* Reproducer 2c.i: element-wise select over three global float arrays.
   All three arrays hold 12 elements, so the loop in foo runs 12 times. */
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };
float res[12];
/* For every index i, store v2[i] into res[i] when v1[i] < v2[i],
   otherwise store v1[i].  Takes no arguments and returns nothing;
   it reads the globals v1 and v2 and writes the global res. */
void
foo (void)
{
int i;
/* Trip count is the element count of res (12), computed from sizeof.  */
for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
/* Written as a compare-and-select rather than a library max call.  */
res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 2c.s
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3
-march=haswell -S 2c.i
[hjl@gnu-cfl-1 pr89028]$ cat 2c.s
# Compiler output for 2c.i (GAS, AT&T syntax: source operands first,
# destination last).  The 12-element loop is fully unrolled into one
# 8-float vector operation followed by four single-float operations.
.file "2c.i"
.text
.p2align 4
.globl foo
.type foo, @function
foo:
.LFB0:
.cfi_startproc
# Elements 0..7: one 256-bit (ymm) packed max of v2 and v1, stored to res.
vmovaps v2(%rip), %ymm1
vmaxps v1(%rip), %ymm1, %ymm0
vmovups %ymm0, res(%rip)
# Element 8 (byte offset 32): scalar load, scalar max, scalar store.
vmovss v2+32(%rip), %xmm0
vmaxss v1+32(%rip), %xmm0, %xmm0
vmovss %xmm0, res+32(%rip)
# Element 9 (byte offset 36): same scalar sequence.
vmovss v2+36(%rip), %xmm0
vmaxss v1+36(%rip), %xmm0, %xmm0
vmovss %xmm0, res+36(%rip)
# Element 10 (byte offset 40): same scalar sequence.
vmovss v2+40(%rip), %xmm0
vmaxss v1+40(%rip), %xmm0, %xmm0
vmovss %xmm0, res+40(%rip)
# Element 11 (byte offset 44): same scalar sequence.
vmovss v2+44(%rip), %xmm0
vmaxss v1+44(%rip), %xmm0, %xmm0
vmovss %xmm0, res+44(%rip)
# Clear the upper halves of the ymm registers before returning.
vzeroupper
ret
.cfi_endproc
We generate 4 scalar vmaxss operations for the last four elements of
res[i] = v1[i] < v2[i] ? v2[i] : v1[i] instead of one 128-bit vmaxps. But this
works:
[hjl@gnu-cfl-1 pr89028]$ cat 3a.i
/* Test case 3a.i: the same three 12-element global float arrays as in
   2c.i, but the loop body is a plain element-wise multiply.  */
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };
float res[12];
/* For every index i, store the product v2[i] * v1[i] into res[i].
   Takes no arguments and returns nothing; it reads the globals v1 and
   v2 and writes the global res. */
void
foo (void)
{
int i;
/* Trip count is the element count of res (12), computed from sizeof.  */
for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 3a.s
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3
-march=haswell -S 3a.i
[hjl@gnu-cfl-1 pr89028]$ cat 3a.s
.file "3a.i"
.text
.p2align 4
.globl foo
.type foo, @function
foo:
.LFB0:
.cfi_startproc
vmovaps v2(%rip), %ymm1
vmulps v1(%rip), %ymm1, %ymm0
vmovaps v1+32(%rip), %xmm2
vmovups %ymm0, res(%rip)
vmulps v2+32(%rip), %xmm2, %xmm0
vmovaps %xmm0, res+32(%rip)
vzeroupper
ret