addition with promotion

rguenth at gcc dot gnu.org via Gcc-bugs Thu, 18 Jan 2024 00:05:37 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113458


--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
On x86_64 with -mavx2 we vectorize

t.c:7:13: note: Vectorizing SLP tree:
t.c:7:13: note: Root stmt: sum_26 = _20 + sum_25;
t.c:7:13: note: node 0x57386c0 (max_nunits=4, refcnt=1) vector(4) int
t.c:7:13: note: op template: _20 = _17 * _19;
t.c:7:13: note:         stmt 0 _5 = _2 * _4;
t.c:7:13: note:         stmt 1 _10 = _7 * _9;
t.c:7:13: note:         stmt 2 _15 = _12 * _14;
t.c:7:13: note:         stmt 3 _20 = _17 * _19;
t.c:7:13: note:         children 0x5738748 0x5738858
t.c:7:13: note: node 0x5738748 (max_nunits=4, refcnt=1) vector(4) int
t.c:7:13: note: op template: _17 = (int) _16;
t.c:7:13: note:         stmt 0 _2 = (int) _1;
t.c:7:13: note:         stmt 1 _7 = (int) _6;
t.c:7:13: note:         stmt 2 _12 = (int) _11;
t.c:7:13: note:         stmt 3 _17 = (int) _16;
t.c:7:13: note:         children 0x57387d0
t.c:7:13: note: node 0x57387d0 (max_nunits=4, refcnt=1) vector(4) short int
t.c:7:13: note: op template: _16 = MEM[(short int *)a_22(D) + 6B];
t.c:7:13: note:         stmt 0 _1 = *a_22(D);
t.c:7:13: note:         stmt 1 _6 = MEM[(short int *)a_22(D) + 2B];
t.c:7:13: note:         stmt 2 _11 = MEM[(short int *)a_22(D) + 4B];
t.c:7:13: note:         stmt 3 _16 = MEM[(short int *)a_22(D) + 6B];
t.c:7:13: note: node 0x5738858 (max_nunits=4, refcnt=1) vector(4) int
t.c:7:13: note: op template: patt_37 = (int) patt_36;
t.c:7:13: note:         stmt 0 patt_28 = (int) patt_27;
t.c:7:13: note:         stmt 1 patt_31 = (int) patt_30;
t.c:7:13: note:         stmt 2 patt_34 = (int) patt_33;
t.c:7:13: note:         stmt 3 patt_37 = (int) patt_36;
t.c:7:13: note:         children 0x57388e0
t.c:7:13: note: node 0x57388e0 (max_nunits=4, refcnt=1) vector(4) signed short
t.c:7:13: note: op template: patt_36 = (signed short) _18;
t.c:7:13: note:         stmt 0 patt_27 = (signed short) _3;
t.c:7:13: note:         stmt 1 patt_30 = (signed short) _8;
t.c:7:13: note:         stmt 2 patt_33 = (signed short) _13;
t.c:7:13: note:         stmt 3 patt_36 = (signed short) _18;
t.c:7:13: note:         children 0x5738968
t.c:7:13: note: node 0x5738968 (max_nunits=4, refcnt=1) vector(4) signed char
t.c:7:13: note: op template: _18 = MEM[(signed char *)b_23(D) + 3B];
t.c:7:13: note:         stmt 0 _3 = *b_23(D);
t.c:7:13: note:         stmt 1 _8 = MEM[(signed char *)b_23(D) + 1B];
t.c:7:13: note:         stmt 2 _13 = MEM[(signed char *)b_23(D) + 2B];
t.c:7:13: note:         stmt 3 _18 = MEM[(signed char *)b_23(D) + 3B];

thus

  vect__16.5_40 = MEM <vector(4) short int> [(short int *)a_22(D)];
  vect__17.6_41 = (vector(4) int) vect__16.5_40;
  vect__18.9_44 = MEM <vector(4) signed char> [(signed char *)b_23(D)];
  vect_patt_36.10_45 = (vector(4) signed short) vect__18.9_44;
  vect_patt_37.11_46 = (vector(4) int) vect_patt_36.10_45;
  vect__20.12_48 = vect__17.6_41 * vect_patt_37.11_46;
  _49 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__20.12_48);
  _50 = .REDUC_PLUS (_49); [tail call]
  _51 = (int) _50;

f:
.LFB0:
        .cfi_startproc
        vpmovsxbd       (%rsi), %xmm1
        vpmovsxwd       (%rdi), %xmm0
        vpmulld %xmm1, %xmm0, %xmm0
        vpsrldq $8, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $4, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax
        ret

similar with SSE4.

We do recognize widening mults as patterns, but we're somehow not using them.
This is likely a failure of reduction root detection, which does not look for
patterns (that's an issue for all of them) - root detection is done before
pattern recog here.  Interestingly enough, for x86 we end up doing

t.c:7:13: note: ------>vectorizing SLP node starting from: patt_38 = _16 w* patt_36;
t.c:7:13: note: vect_is_simple_use: operand MEM[(short int *)a_22(D) + 6B], type of def: internal
t.c:7:13: note: vect_is_simple_use: operand (signed short) _18, type of def: internal
t.c:7:13: note: transform conversion. ncopies = 1.
t.c:7:13: note: add new stmt: _44 = (vector(4) int) vect__16.5_40;
t.c:7:13: note: add new stmt: _45 = (vector(4) int) vect_patt_36.9_43;
t.c:7:13: note: add new stmt: vect_patt_38.10_46 = _44 * _45;

thus we add the very same code as without the pattern.

Does the following help for ARM?

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 086377a9ac0..c0626720651 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3649,6 +3649,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
        {
          vect_location = bb_vinfo->roots[i].roots[0]->stmt;
+         for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
+           bb_vinfo->roots[i].stmts[j]
+             = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
          if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
                                       bb_vinfo->roots[i].stmts,
                                       bb_vinfo->roots[i].roots,

[Bug tree-optimization/113458] Missed SLP for reduction of multiplication/addition with promotion

Reply via email to