https://gcc.gnu.org/g:acba8b3d8dec0124c8b3a7e112b3a784a5091214
commit r15-4787-gacba8b3d8dec0124c8b3a7e112b3a784a5091214 Author: Kugan Vivekanandarajah <kvivekana...@nvidia.com> Date: Thu Oct 31 07:23:10 2024 +1100 [PATCH] Fix SLP when ifcvt versioned loop is not vectorized When ifcvt versions a loop, it sets dont_vectorize on the scalar loop. If the vector loop is not vectorized and is removed, the scalar loop is still left with dont_vectorize. As a result, BB vectorization will not happen. This patch resets dont_vectorize on the scalar loop when IFN_LOOP_VECTORIZED is set to false. gcc/ChangeLog: * tree-vectorizer.cc (pass_vectorize::execute): Reset dont_vectorize to scalar loop when setting IFN_LOOP_VECTORIZED to false. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-77.c: New test. Diff: --- gcc/testsuite/gcc.dg/vect/bb-slp-77.c | 74 +++++++++++++++++++++++++++++++++++ gcc/tree-vectorizer.cc | 2 + 2 files changed, 76 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c new file mode 100644 index 000000000000..b2cc1d114f10 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c @@ -0,0 +1,74 @@ + +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +#include <stdint.h> +#include <string.h> + + +typedef struct { + uint16_t d; + uint16_t m; + uint8_t val1[4]; + uint8_t val2[16]; +} st1; + +typedef struct { + float d; + float s; + int8_t val2[32]; +} st2; + +float table[1 << 16]; + +inline static float foo(uint16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return table[s]; +} + + +void test(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / 32; + + + const st1 * restrict x = vx; + const st2 * restrict y = vy; + + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t val1; + memcpy(&val1, x[i].val1, sizeof(val1)); + + int sumi0 = 0; + int sumi1 = 0; + + if (val1) { + for (int j = 0; j < 16; ++j) { + const uint8_t xh_0 = ((val1 >> (j)) << 4) & 0x10; + const uint8_t xh_1 = ((val1 >> (j 
+ 12)) ) & 0x10; + + const int32_t x0 = (x[i].val2[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].val2[j] >> 4) | xh_1; + + sumi0 += (x0 * y[i].val2[j]); + sumi1 += (x1 * y[i].val2[j + 16]); + } + } else { + for (int j = 0; j < 16; ++j) { + const int32_t x0 = (x[i].val2[j] & 0xF); + const int32_t x1 = (x[i].val2[j] >> 4); + + sumi0 += (x0 * y[i].val2[j]); + sumi1 += (x1 * y[i].val2[j + 16]); + } + } + + int sumi = sumi0 + sumi1; + sumf += (foo(x[i].d)*y[i].d)*sumi + foo(x[i].m)*y[i].s; + } + + *s = sumf; +} + +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" { target { { vect_int_mult && vect_element_align } && { ! powerpc*-*-* } } } } } */ diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index af112f212fe2..16fa0ec1bb77 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1326,6 +1326,7 @@ pass_vectorize::execute (function *fun) if (g) { fold_loop_internal_call (g, boolean_false_node); + loop->dont_vectorize = false; ret |= TODO_cleanup_cfg; g = NULL; } @@ -1335,6 +1336,7 @@ pass_vectorize::execute (function *fun) if (g) { fold_loop_internal_call (g, boolean_false_node); + loop->dont_vectorize = false; ret |= TODO_cleanup_cfg; } }