------- Comment #40 from hubicka at gcc dot gnu dot org  2008-02-01 16:47 
-------
Well, I still meant that simplifying the cascaded additions into an accumulator
into direct additions from the base makes the code simpler.  I experimentally
implemented the trick in fwprop and will attach it later, but the patch by
itself doesn't help.

What happens is now obvious.  For a sequence like:

  D.185211 = *py.4861;
  d = D.180590 * D.185211;
  p1 = py.4861 + 8;
  D.185213 = *p1;
  d = D.180606 * D.185213;
  p1 = py.4861 + 16;
  D.185215 = *p1;
  d = D.180642 * D.185215;
  p1 = py.4861 + 24;
  D.185217 = *p1;
  d = D.180728 * D.185217;
  p1 = py.4861 + 32;
  D.185219 = *p1;
  d = D.180888 * D.185219;
  p1 = py.4861 + 40;
  D.185221 = *p1;
  d = D.181157 * D.185221;
  p1 = py.4861 + 48;
  D.185223 = *p1;
  D.185098 = D.181571 * D.185223;
  D.185094 = d + D.185098;
  D.185090 = d + D.185094;
  D.185086 = d + D.185090;
  D.185082 = d + D.185086;
  D.185078 = d + D.185082;
  D.185074 = d + D.185078;
  D.185210 = *pz;
  d = D.185074 * D.185210;
  py = pz + 8; 
  d = D.180606 * D.185211;
  d = D.180642 * D.185213;
  d = D.180728 * D.185215;
  d = D.180888 * D.185217;
  d = D.181157 * D.185219;
  d = D.181571 * D.185221;
  D.185130 = D.182177 * D.185223;
  D.185126 = d + D.185130;
  D.185122 = d + D.185126;
  D.185118 = d + D.185122;
  D.185114 = d + D.185118;
  D.185110 = d + D.185114;
  D.185106 = d + D.185110;
  D.185225 = *py;
  d = D.185106 * D.185225;
  py = pz + 16;
  d = D.180642 * D.185211;
  d = D.180728 * D.185213;
  d = D.180888 * D.185215;
  d = D.181157 * D.185217;
  d = D.181571 * D.185219;
  d = D.182177 * D.185221;
  D.185162 = D.183023 * D.185223;
  D.185158 = d + D.185162;
  D.185154 = d + D.185158;
  D.185150 = d + D.185154;
  D.185146 = d + D.185150;
  D.185142 = d + D.185146;
  D.185138 = d + D.185142;
  D.185240 = *py;
  d = D.185138 * D.185240;
  py = pz + 24;
  d = D.180728 * D.185211;
  d = D.180888 * D.185213;
  d = D.181157 * D.185215;
  d = D.181571 * D.185217;
  d = D.182177 * D.185219;
  d = D.183023 * D.185221;
  D.185194 = D.184168 * D.185223;
  D.185190 = d + D.185194;
  D.185186 = d + D.185190;
  D.185182 = d + D.185186;
  D.185178 = d + D.185182;
  D.185174 = d + D.185178;
  D.185170 = d + D.185174;
  D.185255 = *py;
  d = D.185170 * D.185255;
  D.185134 = d + d;
  D.185102 = d + D.185134;
  D.185195 = d + D.185102;
  *ap1.4607 = D.185195;
  if (z1.4734 == 0)
    goto <bb 339> (<L351>);
  else
    goto <bb 144>;

that accumulates values from an array into a few variables, TER merges all the
arithmetic into a single giant expression, leaving the loads in front of it.

<L262>:;
  D.197135 = *pz;
  D.197137 = *(pz + 8);
  D.197139 = *(pz + 16);
  D.197141 = *(pz + 24);
  D.197143 = *(pz + 32);
  D.197145 = *(pz + 40);
  D.197147 = *(pz + 48);
  D.197149 = *(pz + 56);
  D.197151 = *(pz + 64);
  D.197153 = *(pz + 72);
  D.197155 = *(pz + 80);
  D.197157 = *(pz + 88);
  D.197159 = *(pz + 96);
  *ap1.4658 = (D.180590 * D.197135 + (D.180606 * D.197137 + (D.180642 *
D.197139 + (D.180728 * D.197141 + (D.180888 * D.197143 + (D.181157 * D.197145 +
(D.181571 * D.197147 + (D.182177 * D.197149 + (D.183023 * D.197151 + (D.184168
* D.197153 + (D.185672 * D.197155 + (D.187606 * D.197157 + D.190042 *
D.197159)))))))))))) * *py.4912 + ((D.180606 * D.197135 + (D.180642 * D.197137
+ (D.180728 * D.197139 + (D.180888 * D.197141 + (D.181157 * D.197143 +
(D.181571 * D.197145 + (D.182177 * D.197147 + (D.183023 * D.197149 + (D.184168
* D.197151 + (D.185672 * D.197153 + (D.187606 * D.197155 + (D.190042 * D.197157
+ D.193063 * D.197159)))))))))))) * *(py.4912 + 8) + (D.180642 * D.197135 +
(D.180728 * D.197137 + (D.180888 * D.197139 + (D.181157 * D.197141 + (D.181571
* D.197143 + (D.182177 * D.197145 + (D.183023 * D.197147 + (D.184168 * D.197149
+ (D.185672 * D.197151 + (D.187606 * D.197153 + (D.190042 * D.197155 +
(D.193063 * D.197157 + D.196753 * D.197159)))))))))))) * *(py.4912 + 16));
  if (z1.4780 == 0)
    goto <bb 339> (<L351>);
  else
    goto <bb 251>;


With the patch for fwprop and -fno-tree-ter I get 5.1s, which is the same as in
pre-GCC-4.0 releases.  Why is TER not placing the loads into the expressions in
the first place?  This seems to me like quite a common pattern to kill register
pressure.

I have to leave, but I will play with it further, check whether the fwprop
patch is needed, and polish it.

Honza


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17863

Reply via email to