https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92344

            Bug ID: 92344
           Summary: Missing considering fre optimization of vector load in
                    auto-vectorization
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---

For testcase loop.c
-------------------
cat loop.c:

/*
 * Reproducer: fills a 4x4 scratch array from two strided byte inputs, then
 * reduces the array column by column into a single sum.
 *
 * input1, input2: byte pointers; offsets 0..7 are read from each on every
 *                 one of the 4 iterations of the first loop.
 * stride1, stride2: added to input1 / input2 after each iteration.
 * Returns the accumulated sum.
 */
int loop
(unsigned char * input1, unsigned char * input2, int stride1, int stride2)
{
    unsigned int tmp[4][4];
    unsigned int var0, var1, var2, var3;
    int sum = 0;
    /* First loop: one row of tmp per iteration. */
    for (int i = 0; i < 4; i++, input1 += stride1, input2 += stride2) {
        /* varK combines bytes K and K+4 of both inputs. */
        var0 = (input1[0] + input2[0]) + (input1[4] + input2[4]);
        var1 = (input1[1] + input2[1]) + (input1[5] + input2[5]);
        var2 = (input1[2] + input2[2]) + (input1[6] + input2[6]);
        var3 = (input1[3] + input2[3]) + (input1[7] + input2[7]);
        /* inter0/inter1 are the same expression, as are inter2/inter3. */
        int inter0 = var0 + var1;
        int inter1 = var0 + var1;
        int inter2 = var2 + var3;
        int inter3 = var2 + var3;
        /* All four elements of row i therefore receive the same value;
           note the stores are written in column order 0, 2, 1, 3. */
        tmp[i][0] = inter0 + inter2;
        tmp[i][2] = inter0 + inter2;
        tmp[i][1] = inter1 + inter3;
        tmp[i][3] = inter1 + inter3;
    }
    /* Second loop: reads tmp back one column per iteration. */
    for (int i = 0; i < 4; i++) {
        /* Again pairwise-identical expressions: rows 0+1 and rows 2+3. */
        int inter0 = tmp[0][i] + tmp[1][i];
        int inter1 = tmp[0][i] + tmp[1][i];
        int inter2 = tmp[2][i] + tmp[3][i];
        int inter3 = tmp[2][i] + tmp[3][i];
        /* var0..var3 all end up holding the sum of column i. */
        var0 = inter0 + inter2;
        var2 = inter0 + inter2;
        var1 = inter1 + inter3;
        var3 = inter1 + inter3;
        sum += var0 + var1 + var2 + var3;
    }

    return sum;
}
---------------

Command line:
--------------------------
/usr/gcc10_20191101/bin/gcc -Ofast -march=skylake loop.c -S
--------------------------

before slp1, we have:
----------
bb2:
  ...
  tmp[0][0] = _95;
  tmp[0][2] = _95;
  tmp[0][1] = _95;
  tmp[0][3] = _95;
  ...
  i_168 = 1;
  tmp[i_168][0] = _168;
  tmp[i_168][2] = _168;
  tmp[i_168][1] = _168;
  tmp[i_168][3] = _168;
  ...
  i_238 = i_168 + 1;
  tmp[i_238][0] = _238;
  tmp[i_238][2] = _238;
  tmp[i_238][1] = _238;
  tmp[i_238][3] = _238;
  ...
  i_309 = i_238 + 1;
  tmp[i_309][0] = _48;
  tmp[i_309][2] = _48;
  tmp[i_309][1] = _48;
  tmp[i_309][3] = _48;
  ...

  vectp_tmp.9_284 = &tmp + 16; ------ &tmp[1][0]
  vectp_tmp.14_276 = &tmp + 32; ----- &tmp[2][0]
  vectp_tmp.17_272 = &tmp + 48; ----- &tmp[3][0]
  vect__51.7_285 = MEM <vector(4) unsigned int> [(unsigned int *)&tmp];
  vect__52.10_281 = MEM <vector(4) unsigned int> [(unsigned int
*)vectp_tmp.9_284];
  vect__55.15_273 = MEM <vector(4) unsigned int> [(unsigned int
*)vectp_tmp.14_276];
  vect__56.18_269 = MEM <vector(4) unsigned int> [(unsigned int
*)vectp_tmp.17_272];
  ..........
-------------

In slp1, 256-bit vectors are generated because the 256-bit vector cost is
lower than the 128-bit vector cost:

256bits vectorization
--------
   _540 = {_238, _238, _238, _238, _48, _48, _48, _48};
  vect_cst__541 = _540;
  _542 = {_95, _95, _95, _95, _168, _168, _168, _168};
  vect_cst__543 = _542;
  MEM <vector(8) unsigned int> [(unsigned int *)&tmp] = vect_cst__543;
  _545 = &tmp[0][0] + 32;
  MEM <vector(8) unsigned int> [(unsigned int *)_545] = vect_cst__541;
-----------

256bits vectorization cost
------------------------------------------
2 times 256bits vector_store costs 48 in body
(2 256bits vector store costs)
---------------------------------------

128bits vectorization
------------------
  _540 = {_95, _95, _95, _95};
  vect_cst__541 = _540;
 _543 = {_48, _48, _48, _48};
  vect_cst__544 = _543;
  _545 = {_238, _238, _238, _238};
  vect_cst__546 = _545;
  _547 = {_168, _168, _168, _168};
  vect_cst__548 = _547;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp] = vect_cst__541;
  vectp.32_549 = &tmp[i_168][0];
  MEM <vector(4) unsigned int> [(unsigned int *)vectp.32_549] = vect_cst__548;
  vectp.32_551 = vectp.32_549 + 16;
  MEM <vector(4) unsigned int> [(unsigned int *)vectp.32_551] = vect_cst__546;
  vectp.32_553 = vectp.32_551 + 16;
  MEM <vector(4) unsigned int> [(unsigned int *)vectp.32_553] = vect_cst__544;
------------------

128bits vectorization cost:

----------------------------
4 times 128bit vector_store costs 64 in body
(4 128bit vector store costs)
---------------------------

But since there are 128-bit loads after these stores, using 128-bit stores
would enable full redundancy elimination of the following 128-bit vector loads,
so it would be more accurate to compute the vectorization cost like this:

128bit vectorization cost:
-------------------------------
4 times 128bit vector_store costs 64 in body **minus 4 times 128bit vector_load
cost 48 in body**

total cost: 64 - 48 = 16 inside the loop body.
-------------------------------

256 bit vectorization cost:
-----------------
2 times 256bit vector_store costs 48 in body

total cost: 48 inside the loop body.
----------------

With these costs, the 128-bit vectorization would be chosen.

Reply via email to