https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82498

--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Not to mention that the #c0 code has undefined behavior if rot is not 0, but a
multiple of 8 * sizeof(uint32_t), like 32, 64, ...
If you insisted on the rot == 0 check it would need to be done after the
rot %= ... and then GCC wouldn't recognize the and operation to be useless.

So, on a more complete testcase where f1, f2, f3 have UB on multiples of 32
other than 0, we get most efficient code on f5 and f8
/* Rotate X left by Y bits, with the zero test performed BEFORE Y is reduced
   modulo the bit width of unsigned.  A non-zero Y that is a multiple of that
   width passes the test, is then reduced to 0, and makes the right shift
   count equal to the full width, which is undefined behavior.  */
unsigned
f1 (unsigned x, unsigned char y)
{
  if (y == 0)
    return x;
  /* Reduction happens after the test, so y may become 0 here.  */
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Same shape as a test-then-reduce rotate, but with an unsigned count: the
   zero test runs before Y is reduced modulo the bit width of unsigned, so a
   non-zero multiple of the width is reduced to 0 afterwards and the right
   shift is by the full width, which is undefined behavior.  */
unsigned
f2 (unsigned x, unsigned y)
{
  if (y == 0)
    return x;
  /* Reduction happens after the test, so y may become 0 here.  */
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Test-then-reduce rotate with an unsigned short count: the zero test runs
   before Y is reduced modulo the bit width of unsigned, so a non-zero
   multiple of the width is reduced to 0 afterwards and the right shift is by
   the full width, which is undefined behavior.  */
unsigned
f3 (unsigned x, unsigned short y)
{
  if (y == 0)
    return x;
  /* Reduction happens after the test, so y may become 0 here.  */
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Rotate X left by Y bits, reducing Y modulo the bit width of unsigned FIRST
   and only then testing for zero.  At the final return the reduced count is
   non-zero and below the width, so neither shift count reaches the width.  */
unsigned
f4 (unsigned x, unsigned char y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  /* A reduced count of 0 means no rotation; returning early also keeps the
     right shift below from using the full width as its count.  */
  if (y == 0)
    return x;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Reduce-then-test rotate with an unsigned count: Y is reduced modulo the bit
   width of unsigned before the zero test, so at the final return the count is
   non-zero and below the width and both shifts are well defined.  */
unsigned
f5 (unsigned x, unsigned y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  /* A reduced count of 0 means no rotation; returning early also keeps the
     right shift below from using the full width as its count.  */
  if (y == 0)
    return x;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Reduce-then-test rotate with an unsigned short count: Y is reduced modulo
   the bit width of unsigned before the zero test, so at the final return the
   count is non-zero and below the width and both shifts are well defined.  */
unsigned
f6 (unsigned x, unsigned short y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  /* A reduced count of 0 means no rotation; returning early also keeps the
     right shift below from using the full width as its count.  */
  if (y == 0)
    return x;
  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
}

/* Branch-free rotate-left attempt with an unsigned char count: Y is reduced
   modulo the bit width of unsigned and the right shift count is written as
   -y modulo that width, with no special case for zero.

   NOTE(review): -y is evaluated after integer promotion.  Assuming unsigned
   char promotes to int and the divisor has type int (true on common targets;
   confirm), -y is <= 0 and % truncates toward zero, so the right shift count
   is negative for every reduced y other than 0 -- undefined behavior.  */
unsigned
f7 (unsigned x, unsigned char y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (-y % (__CHAR_BIT__ * __SIZEOF_INT__)));
}

/* Branch-free rotate-left with an unsigned int count.  Y is reduced modulo
   the bit width of unsigned; because y is unsigned int, -y is computed in
   unsigned arithmetic and -y % width is again below the width.  Both shift
   counts are therefore in range, and a reduced count of 0 yields x | x.  */
unsigned
f8 (unsigned x, unsigned int y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (-y % (__CHAR_BIT__ * __SIZEOF_INT__)));
}

/* Branch-free rotate-left attempt with an unsigned short count: Y is reduced
   modulo the bit width of unsigned and the right shift count is written as
   -y modulo that width, with no special case for zero.

   NOTE(review): -y is evaluated after integer promotion.  Assuming unsigned
   short promotes to int (i.e. int is wider than short; confirm for the
   target) and the divisor has type int, -y is <= 0 and % truncates toward
   zero, so the right shift count is negative for every reduced y other than
   0 -- undefined behavior.  */
unsigned
f9 (unsigned x, unsigned short y)
{
  y %= __CHAR_BIT__ * __SIZEOF_INT__;
  return (x << y) | (x >> (-y % (__CHAR_BIT__ * __SIZEOF_INT__)));
}

where f8 is pattern recognized as x r<< (y & 31) and f5 is pattern recognized
and phiopt optimized into that.
The rest produce inefficient code, f1/f4/f6 with useless & and useless
comparison + cmov, f2/f3 just with useless comparison/cmov, and f7/f9 aren't
even pattern recognized as rotates.

The real question is how many different portable and non-portable ways of doing
this we really need to pattern recognize, there are many weirdo ways this can
be written.

Reply via email to