https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176
Bug ID: 89176
Summary: Vectorizer fails to consider narrower vector width for res[i] = v1[i] < v2[i] ? v2[i] : v1[i]
Product: gcc
Version: 9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hjl.tools at gmail dot com
Target Milestone: ---

[hjl@gnu-cfl-1 pr89028]$ cat 2c.i
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };
float res[12];

void
foo (void)
{
  int i;
  for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
    res[i] = v1[i] < v2[i] ? v2[i] : v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 2c.s
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3 -march=haswell -S 2c.i
[hjl@gnu-cfl-1 pr89028]$ cat 2c.s
        .file "2c.i"
        .text
        .p2align 4
        .globl foo
        .type foo, @function
foo:
.LFB0:
        .cfi_startproc
        vmovaps v2(%rip), %ymm1
        vmaxps v1(%rip), %ymm1, %ymm0
        vmovups %ymm0, res(%rip)
        vmovss v2+32(%rip), %xmm0
        vmaxss v1+32(%rip), %xmm0, %xmm0
        vmovss %xmm0, res+32(%rip)
        vmovss v2+36(%rip), %xmm0
        vmaxss v1+36(%rip), %xmm0, %xmm0
        vmovss %xmm0, res+36(%rip)
        vmovss v2+40(%rip), %xmm0
        vmaxss v1+40(%rip), %xmm0, %xmm0
        vmovss %xmm0, res+40(%rip)
        vmovss v2+44(%rip), %xmm0
        vmaxss v1+44(%rip), %xmm0, %xmm0
        vmovss %xmm0, res+44(%rip)
        vzeroupper
        ret
        .cfi_endproc

The first 8 elements are handled by one 256-bit vmaxps, but for the remaining 4 elements we generate 4 scalar vmaxss operations for res[i] = v1[i] < v2[i] ? v2[i] : v1[i] instead of a single 128-bit vmaxps.
But the equivalent loop with a multiplication works; its remaining 4 elements are vectorized with a single 128-bit vmulps:

[hjl@gnu-cfl-1 pr89028]$ cat 3a.i
float v1[] = { 8.3, 3.4, 8.3, 3.4, 5.8, 9.7, 5.8, 9.7, 8.3, 3.4, 8.3, 3.4 };
float v2[] = { 5.8, 9.7, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 8.3, 3.4, 5.8, 9.7 };
float res[12];

void
foo (void)
{
  int i;
  for (i = 0; i < sizeof (res) / sizeof (res[0]); i++)
    res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89028]$ make 3a.s
/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-mmx-debug/build-x86_64-linux/gcc/ -O3 -march=haswell -S 3a.i
[hjl@gnu-cfl-1 pr89028]$ cat 3a.s
        .file "3a.i"
        .text
        .p2align 4
        .globl foo
        .type foo, @function
foo:
.LFB0:
        .cfi_startproc
        vmovaps v2(%rip), %ymm1
        vmulps v1(%rip), %ymm1, %ymm0
        vmovaps v1+32(%rip), %xmm2
        vmovups %ymm0, res(%rip)
        vmulps v2+32(%rip), %xmm2, %xmm0
        vmovaps %xmm0, res+32(%rip)
        vzeroupper
        ret