I found a code size regression for the AVR target in gcc-5 and higher. It
appears to affect x86_64 as well.

Test case (options: -Os)
---------
/* Fixed-width types declared via GCC's mode attribute so that the test case
   needs no headers: QI is the one-byte integer mode, SI the four-byte one. */
typedef unsigned int uint8_t __attribute__((__mode__(__QI__)));
typedef unsigned int uint32_t __attribute__ ((__mode__ (__SI__)));
typedef struct rpl_instance rpl_instance_t;
/* Minimal structure for the reproducer; only the two fields that
   new_dio_interval() touches are declared. */
struct rpl_instance {
  uint8_t dio_intcurrent;   /* read as the shift count in new_dio_interval() */
  uint32_t dio_next_delay;  /* written, then read back and updated */
};
/* External function with no visible definition, so the compiler has to treat
   the call as opaque. */
unsigned short random_rand(void);

/* Reproducer for the code size regression described in this report.
 *
 * Computes ticks = ((1 << dio_intcurrent) * 128) / 1000, stores it in
 * instance->dio_next_delay, then subtracts
 * ticks/2 + (ticks/2 * random_rand()) / 65535 from the stored value.
 *
 * The exact shape of the expressions matters: the SSA dump that follows in
 * this report was produced from this source, so it must not be restructured.
 */
void
new_dio_interval(rpl_instance_t *instance)
{
  uint32_t time;
  uint32_t ticks;
  /* NOTE(review): the shift count is not range-checked here. */
  time = 1UL << instance->dio_intcurrent;
  ticks = (time * 128) / 1000;
  instance->dio_next_delay = ticks;
  /* "ticks / 2" is written twice on purpose: the two occurrences become the
     temporaries _10 and _11 in the SSA dump, which the report is about. */
  ticks = ticks / 2 + (ticks / 2 * (uint32_t)random_rand()) / 65535U;
  /* Read-modify-write of the field stored above (after the opaque call). */
  instance->dio_next_delay -= ticks;
}

ssa dump
--------
_3 = instance_2(D)->dio_intcurrent
_4 = (int) _3
_5 = 1 << _4
time_6  = (uint32_t)_5
_7 = time_6 * 128
ticks_8 = _7 / 1000
instance_2(D)->dio_next_delay = ticks_8
_10 = ticks_8 / 2
_11 = ticks_8 / 2
_13 = random_rand()
_14 = (unsigned int) _13
_15 = _11 * _14
_16 = _15 / 65535
ticks_17 = _11 + _16
_18 = instance_2(D)->dio_next_delay
_19 = _18 - ticks_17
instance_2(D)->dio_next_delay = _19
return

gcc-5 and higher generate suboptimal code for the _10 definition, as below:
  _10 = _7 / 2000
whereas gcc-4 generates _10 = ticks_8 >> 1.
Below are a few differences in the passes that lead to this suboptimal code.

pass      |            gcc 4              |         gcc 5 and higher     |
----------+-------------------------------+------------------------------+
ccp1      | _10 definition removed as dce | No change                    |
          |                               |                              |
forwprop1 | No change                     | gimple_simplified _10 & _11  |
          |                               |    _10 = _7 / 2000           |
          |                               |    _11 = _7 / 2000           |
          |                               |                              |
cddce1    | No change                     | _10 definition removed       |
          |                               |                              |
ccp2      | No change                     | No change                    |
          |                               |                              |
vrp1      | _10 = ticks_8 / 2             | No change                    |
          |     changed to                |                              |
          | _10 = ticks_8 >> 1            |                              |
----------+-------------------------------+------------------------------+

Forward propagation in gcc-4 doesn't propagate the definition of ticks_8 into
_11, whereas gcc-5 propagates it into both expressions (_10 and _11). Is that
valid? It prevents the vrp pass from optimizing the rhs expression of _11 in
this testcase.

Regards,
Pitchumani

Reply via email to